eventfd
eventfdは、実ファイルを有さないバッファによるpipeに係る操作に相当し、eventfdバッファは、pipeバッファのchar *でなくuint64_t file->private_data->countの整数。pipe多重書き込みはchar *の追加で、eventfd多重書き込みはuint64_tの加算とする。読込はfile->private_data->countの減算で0なら減算できない故、書込まれcount値が0でなくなるまでウエイトする。EFD_NONBLOCKならpipeと同様countが0でもreadでウエイトしない。
デフォルトの読込はfile->private_data->count値で、読込後はfile->private_data->count=0となる。EFD_SEMAPHOREでの読込は1とし、運用上file->private_data->countはread可能回数となる。
EFD_NONBLOCKなら、file->private_data->count=0でもread()はfile->private_data->count!=0となるまでウエイトせず、即座にエラー(EAGAIN)で戻る。
[root@north eventfd]# cat noblock.c
デフォルトの読込はfile->private_data->count値で、読込後はfile->private_data->count=0となる。EFD_SEMAPHOREでの読込は1とし、運用上file->private_data->countはread可能回数となる。
サンプル
[root@north eventfd]# cat semaphore.c#include <sys/eventfd.h> #include <unistd.h> #include <stdlib.h> #include <stdio.h> #include <string.h> void event_read(int efd, int init_value, int write_value); int main(int argc, char *argv[]) { int efd; uint64_t u; if (!strcmp(argv[2], "EFD_SEMAPHORE")) { efd = eventfd(3, EFD_SEMAPHORE | EFD_NONBLOCK); } if (!strcmp(argv[2], "NO_EFD_SEMAPHORE")) { efd = eventfd(3, EFD_NONBLOCK); } u = atoi(argv[1]); if (u) { write(efd, &u, sizeof(uint64_t)); } event_read(efd, 3, u); close(efd); } void event_read(int efd, int init_value, int write_value) { uint64_t u; int ret, cnt = 0; printf("%10s:%10s eventfd->count=%d+%d n", "read cnt", "read value", init_value, write_value); while (1) { u = -1; ret = read(efd, &u, sizeof(uint64_t)); if (ret != sizeof(uint64_t)) { printf("%10d:%10d n", ++cnt, u); break; } printf("%10d:%10d n", ++cnt, u); } printf("noread ret:%d n", ret); }[root@north eventfd]# ./semaphore.out 0 EFD_SEMAPHORE
read cnt:read value eventfd->count=3+0 1: 1 2: 1 3: 1 4: -1 noread ret:-1[root@north eventfd]# ./semaphore.out 2 EFD_SEMAPHORE
read cnt:read value eventfd->count=3+2 1: 1 2: 1 3: 1 4: 1 5: 1 6: -1 noread ret:-1[root@north eventfd]# ./semaphore.out 0 NO_EFD_SEMAPHORE
read cnt:read value eventfd->count=3+0 1: 3 2: -1 noread ret:-1[root@north eventfd]# ./semaphore.out 2 NO_EFD_SEMAPHORE
read cnt:read value eventfd->count=3+2 1: 5 2: -1 noread ret:-1
BLOCK
デフォルトのBLOCKは、file->private_data->count=0ならread()はfile->private_data->count!=0となるまでウエイト。EFD_NONBLOCKなら、file->private_data->count=0でもread()はウエイトせず、即座にエラー(EAGAIN)で戻る。
[root@north eventfd]# cat noblock.c
#include <sys/eventfd.h> #include <unistd.h> #include <stdlib.h> #include <stdio.h> #include <string.h> void event_read(int efd); int main(int argc, char *argv[]) { int efd; uint64_t u; if (argc == 1) { printf("BLOCK n"); printf("NONBLOCK n"); exit(0); } if (!strcmp(argv[1], "BLOCK")) { efd = eventfd(0, 0); } if (!strcmp(argv[1], "NONBLOCK")) { efd = eventfd(0, EFD_NONBLOCK); } if (fork()) { sleep(1); printf("parent write start n"); u = 11; int s = write(efd, &u, sizeof(uint64_t)); printf("write:%d n", u); } else { printf("child read start n"); u = 0; read(efd, &u, sizeof(uint64_t)); printf("read :%d n", u); } close(efd); }
[root@north eventfd]# ./noblock.out NONBLOCK child read start read :0 parent write start write:11 [root@north eventfd]# ./noblock.out BLOCK child read start parent write start write:11 read :11
カーネル
struct eventfd_ctx { struct kref kref; wait_queue_head_t wqh; __u64 count; <- pipeバッファに相当 unsigned int flags; } priv; struct file { : void *private_data; : }; static const struct file_operations eventfd_fops = { .release = eventfd_release, .poll = eventfd_poll, .read = eventfd_read, .write = eventfd_write, .llseek = noop_llseek, }; current->files[efd]=struct file *efile: __u64 efile->private_data->count; バッファ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) { int fd, error; struct file *file; error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS); if (error < 0) return error; fd = error; file = eventfd_file_create(count, flags); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_put_unused_fd; } fd_install(fd, file); return fd; err_put_unused_fd: put_unused_fd(fd); return error; } struct file *eventfd_file_create(unsigned int count, int flags) { struct file *file; struct eventfd_ctx *ctx; BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); if (flags & ~EFD_FLAGS_SET) return ERR_PTR(-EINVAL); ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); kref_init(&ctx->kref); init_waitqueue_head(&ctx->wqh); ctx->count = count; <- 初期値は引数のcountのeventfdのread/writeバッファ ctx->flags = flags; file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS)); if (IS_ERR(file)) eventfd_free_ctx(ctx); return file; } struct file *anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags) { struct qstr this; struct path path; struct file *file; int error; if (IS_ERR(anon_inode_inode)) return ERR_PTR(-ENODEV); if (fops->owner && !try_module_get(fops->owner)) return ERR_PTR(-ENOENT); error = -ENOMEM; this.name = name; this.len = strlen(name); this.hash = 0; path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this); if (!path.dentry) goto err_module; path.mnt = mntget(anon_inode_mnt); ihold(anon_inode_inode); d_instantiate(path.dentry, 
anon_inode_inode); error = -ENFILE; file = alloc_file(&path, OPEN_FMODE(flags), fops); if (!file) goto err_dput; file->f_mapping = anon_inode_inode->i_mapping; file->f_pos = 0; file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); file->f_version = 0; file->private_data = priv; <--- priv = struct eventfd_ctx *ctx return file; err_dput: path_put(&path); err_module: module_put(fops->owner); return ERR_PTR(error); } #define ULLONG_MAX (~0ULL) file->f_op = eventfd_fops; static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct eventfd_ctx *ctx = file->private_data; ssize_t res; __u64 ucnt; DECLARE_WAITQUEUE(wait, current); if (count < sizeof(ucnt)) <- 書き込みサイズが8バイト未満ならエラー return -EINVAL; if (copy_from_user(&ucnt, buf, sizeof(ucnt))) return -EFAULT; if (ucnt == ULLONG_MAX) <- 書き込み値が8バイトの最大値ならエラー return -EINVAL; spin_lock_irq(&ctx->wqh.lock); res = -EAGAIN; if (ULLONG_MAX - ctx->count > ucnt) <- 加算後のctx->countがULLONG_MAX未満に収まるかのチェック。収まらない場合、O_NONBLOCKなら-EAGAIN、O_NONBLOCKでなければ加算可能になるまでウエイト res = sizeof(ucnt); else if (!(file->f_flags & O_NONBLOCK)) { __add_wait_queue(&ctx->wqh, &wait); for (res = 0;;) { set_current_state(TASK_INTERRUPTIBLE); if (ULLONG_MAX - ctx->count > ucnt) { res = sizeof(ucnt); break; } if (signal_pending(current)) { res = -ERESTARTSYS; break; } spin_unlock_irq(&ctx->wqh.lock); schedule(); spin_lock_irq(&ctx->wqh.lock); } __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } if (likely(res > 0)) { ctx->count += ucnt; <- 引数データ値の書き込み if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, POLLIN); } spin_unlock_irq(&ctx->wqh.lock); return res; } static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct eventfd_ctx *ctx = file->private_data; ssize_t res; __u64 cnt; if (count < sizeof(cnt)) return -EINVAL; res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt); if (res < 0) return res; return put_user(cnt, (__u64 __user *) buf) ? 
-EFAULT : sizeof(cnt); } ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt) { ssize_t res; DECLARE_WAITQUEUE(wait, current); spin_lock_irq(&ctx->wqh.lock); *cnt = 0; res = -EAGAIN; if (ctx->count > 0) res = 0; else if (!no_wait) { __add_wait_queue(&ctx->wqh, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (ctx->count > 0) { <- ctx->count=0なら、writeによるwake_upまたはsignal受信までschedule()でウエイト res = 0; break; } if (signal_pending(current)) { res = -ERESTARTSYS; break; } spin_unlock_irq(&ctx->wqh.lock); schedule(); spin_lock_irq(&ctx->wqh.lock); } __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } if (likely(res == 0)) { eventfd_ctx_do_read(ctx, cnt); if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, POLLOUT); } spin_unlock_irq(&ctx->wqh.lock); return res; } static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) { *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1: ctx->count; <-EFD_SEMAPHORE:読込値は1。NO_EFD_SEMAPHORE:読込値はctx->count ctx->count -= *cnt; }