// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2020 Google, Inc
 * Copyright (C) 2020 Palmer Dabbelt
 */

#include <linux/device-mapper.h>
#include <uapi/linux/dm-user.h>

#include <linux/bio.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/uio.h>
#include <linux/wait.h>

#define DM_MSG_PREFIX "user"
#define MAX_OUTSTANDING_MESSAGES 128

/*
 * dm-user uses four structures:
 *
 * - "struct target", the outermost structure, corresponds to a single device
 *   mapper target.  This contains the set of outstanding BIOs that have been
 *   provided by DM and are not actively being processed by the user, along
 *   with a misc device that userspace can open to communicate with the
 *   kernel.  Each time userspace opens the misc device a new channel is
 *   created.
 * - "struct channel", which represents a single active communication channel
 *   with userspace.  Userspace may choose arbitrary read/write sizes to use
 *   when processing messages; channels form these into logical accesses.
 *   When userspace responds to a full message the channel completes the BIO
 *   and obtains a new message to process from the target.
 * - "struct message", which wraps a BIO with the additional information
 *   required by the kernel to sort out what to do with BIOs when they return
 *   from userspace.
 * - "struct dm_user_message", which is the exact message format that
 *   userspace sees.
 *
 * The hot path contains three distinct operations:
 *
 * - user_map(), which is provided a BIO from device mapper that is queued
 *   into the target.  This allocates and enqueues a new message.
 * - dev_read(), which dequeues a message and copies it to userspace.
 * - dev_write(), which looks up a message (keyed by sequence number) and
 *   completes the corresponding BIO.
 *
 * Lock ordering (outer to inner)
 *
 * 1) miscdevice's global lock.  This is held around dev_open, so it has to be
 *    the outermost lock.
 * 2) target->lock
 * 3) channel->lock
 */
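/*
 * For illustration only, a minimal sketch of the userspace side of this
 * protocol.  It assumes the target's miscdev was named "foo" (see
 * user_ctr()), that the uapi header is installed as <linux/dm-user.h>, that
 * MAX_IO_BYTES is a daemon-chosen upper bound on payload size (hypothetical,
 * not part of the ABI), and that the payload immediately follows the header
 * in the byte stream, which is how the copy routines below lay it out.
 * Error handling and short reads/writes are elided; a real daemon must cope
 * with both.
 *
 *     #include <fcntl.h>
 *     #include <unistd.h>
 *     #include <linux/dm-user.h>
 *
 *     int fd = open("/dev/dm-user/foo", O_RDWR);
 *     for (;;) {
 *             char buf[sizeof(struct dm_user_message) + MAX_IO_BYTES];
 *             struct dm_user_message *msg = (struct dm_user_message *)buf;
 *             char *payload = buf + sizeof(*msg);
 *
 *             read(fd, buf, sizeof(buf));  // blocks until a BIO arrives
 *
 *             switch (msg->type) {
 *             case DM_USER_REQ_MAP_READ:
 *                     // Produce msg->len bytes for sector msg->sector in
 *                     // payload, keep msg->seq intact, then acknowledge.
 *                     msg->type = DM_USER_RESP_SUCCESS;
 *                     write(fd, buf, sizeof(*msg) + msg->len);
 *                     break;
 *             case DM_USER_REQ_MAP_WRITE:
 *                     // payload already holds msg->len bytes to store.
 *                     msg->type = DM_USER_RESP_SUCCESS;
 *                     write(fd, buf, sizeof(*msg));
 *                     break;
 *             default:
 *                     msg->type = DM_USER_RESP_UNSUPPORTED;
 *                     write(fd, buf, sizeof(*msg));
 *             }
 *     }
 */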
struct message {
	/*
	 * Messages themselves do not need a lock, they're protected by either
	 * the target or channel's lock, depending on which can reference them
	 * directly.
	 */
	struct dm_user_message msg;

	struct bio *bio;
	size_t posn_to_user;
	size_t total_to_user;
	size_t posn_from_user;
	size_t total_from_user;

	struct list_head from_user;
	struct list_head to_user;

	/*
	 * These are written back from the user.  They live in the same spot in
	 * the message, but we need to either keep the old values around or
	 * call a bunch more BIO helpers.  These are only valid after write has
	 * adopted the message.
	 */
	u64 return_type;
	u64 return_flags;
};

struct target {
	/*
	 * A target has a single lock, which protects everything in the target
	 * (but does not protect the channels associated with a target).
	 */
	struct mutex lock;

	/*
	 * There is only one point at which anything blocks: userspace blocks
	 * reading a new message, which is woken up by device mapper providing
	 * a new BIO to process (or tearing down the target).  The
	 * corresponding write side doesn't block; instead we treat userspace's
	 * response containing a message that has yet to be mapped as an
	 * invalid operation.
	 */
	struct wait_queue_head wq;

	/*
	 * Messages are delivered to userspace in order, but may be returned
	 * out of order.  This allows userspace to schedule IO if it wants to.
	 */
	mempool_t message_pool;
	u64 next_seq_to_map;
	u64 next_seq_to_user;
	struct list_head to_user;

	/*
	 * There is a misc device per target.  The name is selected by
	 * userspace (via a DM create ioctl argument), and each ends up in
	 * /dev/dm-user/.  It looks like a better way to do this may be to have
	 * a filesystem to manage these, but this was more expedient.  The
	 * current mechanism is functional, but does result in an arbitrary
	 * number of dynamically created misc devices.
	 */
	struct miscdevice miscdev;

	/*
	 * Device mapper's target destructor triggers tearing this all down,
	 * but we can't actually free until every channel associated with this
	 * target has been destroyed.  Channels each have a reference to their
	 * target, and there is an additional single reference that corresponds
	 * to both DM and the misc device (both of which are destroyed by DM).
	 *
	 * In the common case userspace will be asleep waiting for a new
	 * message when device mapper decides to destroy the target, which
	 * means no new messages will appear.  The destroyed flag triggers a
	 * wakeup, which will end up removing the reference.
	 */
	struct kref references;
	int dm_destroyed;
};

struct channel {
	struct target *target;

	/*
	 * A channel has a single lock, which prevents multiple reads (or
	 * multiple writes) from conflicting with each other.
	 */
	struct mutex lock;

	struct message *cur_to_user;
	struct message *cur_from_user;
	ssize_t to_user_error;
	ssize_t from_user_error;

	/*
	 * Once a message has been forwarded to userspace on a channel it must
	 * be responded to on the same channel.  This allows us to error out
	 * the messages that have not yet been responded to by a channel when
	 * that channel closes, which makes handling errors more reasonable for
	 * fault-tolerant userspace daemons.  It also happens to make avoiding
	 * shared locks between user_map() and dev_read() a lot easier.
	 *
	 * This does preclude a multi-threaded work stealing userspace
	 * implementation (or at least, force a degree of head-of-line blocking
	 * on the response path).
	 */
	struct list_head from_user;

	/*
	 * Responses from userspace can arrive in arbitrarily small chunks.
	 * We need some place to buffer one up until we can find the
	 * corresponding kernel-side message to continue processing, so instead
	 * of allocating them we just keep one off to the side here.  This can
	 * only ever be pointed to by cur_from_user, and will never have a BIO.
	 */
	struct message scratch_message_from_user;
};

static inline struct target *target_from_target(struct dm_target *target)
{
	WARN_ON(target->private == NULL);
	return target->private;
}

static inline struct target *target_from_miscdev(struct miscdevice *miscdev)
{
	return container_of(miscdev, struct target, miscdev);
}

static inline struct channel *channel_from_file(struct file *file)
{
	WARN_ON(file->private_data == NULL);
	return file->private_data;
}

static inline struct target *target_from_channel(struct channel *c)
{
	WARN_ON(c->target == NULL);
	return c->target;
}

static inline size_t bio_size(struct bio *bio)
{
	struct bio_vec bvec;
	struct bvec_iter iter;
	size_t out = 0;

	bio_for_each_segment (bvec, bio, iter)
		out += bio_iter_len(bio, iter);
	return out;
}
static inline size_t bio_bytes_needed_to_user(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_WRITE:
		return sizeof(struct dm_user_message) + bio_size(bio);
	case REQ_OP_READ:
	case REQ_OP_FLUSH:
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_SAME:
	case REQ_OP_WRITE_ZEROES:
		return sizeof(struct dm_user_message);

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}

static inline size_t bio_bytes_needed_from_user(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_READ:
		return sizeof(struct dm_user_message) + bio_size(bio);
	case REQ_OP_WRITE:
	case REQ_OP_FLUSH:
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_SAME:
	case REQ_OP_WRITE_ZEROES:
		return sizeof(struct dm_user_message);

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}

static inline long bio_type_to_user_type(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_READ:
		return DM_USER_REQ_MAP_READ;
	case REQ_OP_WRITE:
		return DM_USER_REQ_MAP_WRITE;
	case REQ_OP_FLUSH:
		return DM_USER_REQ_MAP_FLUSH;
	case REQ_OP_DISCARD:
		return DM_USER_REQ_MAP_DISCARD;
	case REQ_OP_SECURE_ERASE:
		return DM_USER_REQ_MAP_SECURE_ERASE;
	case REQ_OP_WRITE_SAME:
		return DM_USER_REQ_MAP_WRITE_SAME;
	case REQ_OP_WRITE_ZEROES:
		return DM_USER_REQ_MAP_WRITE_ZEROES;

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}

static inline long bio_flags_to_user_flags(struct bio *bio)
{
	u64 out = 0;
	typeof(bio->bi_opf) opf = bio->bi_opf & ~REQ_OP_MASK;

	if (opf & REQ_FAILFAST_DEV) {
		opf &= ~REQ_FAILFAST_DEV;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DEV;
	}

	if (opf & REQ_FAILFAST_TRANSPORT) {
		opf &= ~REQ_FAILFAST_TRANSPORT;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_TRANSPORT;
	}

	if (opf & REQ_FAILFAST_DRIVER) {
		opf &= ~REQ_FAILFAST_DRIVER;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DRIVER;
	}

	if (opf & REQ_SYNC) {
		opf &= ~REQ_SYNC;
		out |= DM_USER_REQ_MAP_FLAG_SYNC;
	}

	if (opf & REQ_META) {
		opf &= ~REQ_META;
		out |= DM_USER_REQ_MAP_FLAG_META;
	}

	if (opf & REQ_PRIO) {
		opf &= ~REQ_PRIO;
		out |= DM_USER_REQ_MAP_FLAG_PRIO;
	}

	if (opf & REQ_NOMERGE) {
		opf &= ~REQ_NOMERGE;
		out |= DM_USER_REQ_MAP_FLAG_NOMERGE;
	}

	if (opf & REQ_IDLE) {
		opf &= ~REQ_IDLE;
		out |= DM_USER_REQ_MAP_FLAG_IDLE;
	}

	if (opf & REQ_INTEGRITY) {
		opf &= ~REQ_INTEGRITY;
		out |= DM_USER_REQ_MAP_FLAG_INTEGRITY;
	}

	if (opf & REQ_FUA) {
		opf &= ~REQ_FUA;
		out |= DM_USER_REQ_MAP_FLAG_FUA;
	}

	if (opf & REQ_PREFLUSH) {
		opf &= ~REQ_PREFLUSH;
		out |= DM_USER_REQ_MAP_FLAG_PREFLUSH;
	}

	if (opf & REQ_RAHEAD) {
		opf &= ~REQ_RAHEAD;
		out |= DM_USER_REQ_MAP_FLAG_RAHEAD;
	}

	if (opf & REQ_BACKGROUND) {
		opf &= ~REQ_BACKGROUND;
		out |= DM_USER_REQ_MAP_FLAG_BACKGROUND;
	}

	if (opf & REQ_NOWAIT) {
		opf &= ~REQ_NOWAIT;
		out |= DM_USER_REQ_MAP_FLAG_NOWAIT;
	}

	if (opf & REQ_NOUNMAP) {
		opf &= ~REQ_NOUNMAP;
		out |= DM_USER_REQ_MAP_FLAG_NOUNMAP;
	}

	if (unlikely(opf)) {
		pr_warn("unsupported BIO type %x\n", opf);
		return -EOPNOTSUPP;
	}
	WARN_ON(out < 0);
	return out;
}
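/*
 * To make the framing above concrete (worked example, not code): for a 4 KiB
 * WRITE, userspace reads sizeof(struct dm_user_message) + 4096 bytes (header
 * plus the data to be written) and writes back just the header; for a 4 KiB
 * READ it reads only the header and writes back the header plus the 4096
 * bytes it produced.  Each REQ_* flag maps to a corresponding
 * DM_USER_REQ_MAP_FLAG_* value, so a daemon honouring write barriers might
 * check (sketch; the two helpers are hypothetical daemon functions):
 *
 *     if (msg->flags & DM_USER_REQ_MAP_FLAG_PREFLUSH)
 *             flush_backing_store();   // hypothetical helper
 *     if (msg->flags & DM_USER_REQ_MAP_FLAG_FUA)
 *             sync_this_write();       // hypothetical helper
 */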
/*
 * Not quite what's in blk-map.c, but instead what I thought the functions in
 * blk-map did.  This one seems more generally useful and I think we could
 * write the blk-map version in terms of this one.  The differences are that
 * this has a return value that counts, and blk-map uses the BIO _all iters.
 * These advance the IOV iter but don't advance the BIO iter, which is a bit
 * odd here; the callers advance the BIO by hand afterwards.
 */
static ssize_t bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
{
	struct bio_vec bvec;
	struct bvec_iter biter;
	ssize_t out = 0;

	bio_for_each_segment (bvec, bio, biter) {
		ssize_t ret;

		ret = copy_page_from_iter(bvec.bv_page, bvec.bv_offset,
					  bvec.bv_len, iter);

		/*
		 * FIXME: I thought that IOV copies had a mechanism for
		 * terminating early, if for example a signal came in while
		 * sleeping waiting for a page to be mapped, but I don't see
		 * where that would happen.
		 */
		WARN_ON(ret < 0);
		out += ret;

		if (!iov_iter_count(iter))
			break;

		if (ret < bvec.bv_len)
			return ret;
	}

	return out;
}

static ssize_t bio_copy_to_iter(struct bio *bio, struct iov_iter *iter)
{
	struct bio_vec bvec;
	struct bvec_iter biter;
	ssize_t out = 0;

	bio_for_each_segment (bvec, bio, biter) {
		ssize_t ret;

		ret = copy_page_to_iter(bvec.bv_page, bvec.bv_offset,
					bvec.bv_len, iter);

		/* As above. */
		WARN_ON(ret < 0);
		out += ret;

		if (!iov_iter_count(iter))
			break;

		if (ret < bvec.bv_len)
			return ret;
	}

	return out;
}

static ssize_t msg_copy_to_iov(struct message *msg, struct iov_iter *to)
{
	ssize_t copied = 0;

	if (!iov_iter_count(to))
		return 0;

	if (msg->posn_to_user < sizeof(msg->msg)) {
		copied = copy_to_iter((char *)(&msg->msg) + msg->posn_to_user,
				      sizeof(msg->msg) - msg->posn_to_user,
				      to);
	} else {
		copied = bio_copy_to_iter(msg->bio, to);
		if (copied > 0)
			bio_advance(msg->bio, copied);
	}

	if (copied < 0)
		return copied;

	msg->posn_to_user += copied;
	return copied;
}

static ssize_t msg_copy_from_iov(struct message *msg, struct iov_iter *from)
{
	ssize_t copied = 0;

	if (!iov_iter_count(from))
		return 0;

	if (msg->posn_from_user < sizeof(msg->msg)) {
		copied = copy_from_iter(
			(char *)(&msg->msg) + msg->posn_from_user,
			sizeof(msg->msg) - msg->posn_from_user, from);
	} else {
		copied = bio_copy_from_iter(msg->bio, from);
		if (copied > 0)
			bio_advance(msg->bio, copied);
	}

	if (copied < 0)
		return copied;

	msg->posn_from_user += copied;
	return copied;
}
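/*
 * A consequence of the chunked copying above is that userspace need not read
 * or write a whole message in one syscall: the position counters in struct
 * message carry the state across calls.  A daemon reading with a small
 * buffer could, for instance, accumulate a header like this (sketch for
 * illustration; the fd and error handling come from the caller):
 *
 *     size_t got = 0;
 *     while (got < sizeof(struct dm_user_message)) {
 *             ssize_t n = read(fd, (char *)msg + got,
 *                              sizeof(struct dm_user_message) - got);
 *             if (n <= 0)
 *                     break;          // error handling elided
 *             got += n;
 *     }
 */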
static struct message *msg_get_map(struct target *t)
{
	struct message *m;

	lockdep_assert_held(&t->lock);

	m = mempool_alloc(&t->message_pool, GFP_NOIO);
	m->msg.seq = t->next_seq_to_map++;
	INIT_LIST_HEAD(&m->to_user);
	INIT_LIST_HEAD(&m->from_user);
	return m;
}

static struct message *msg_get_to_user(struct target *t)
{
	struct message *m;

	lockdep_assert_held(&t->lock);

	if (list_empty(&t->to_user))
		return NULL;

	m = list_first_entry(&t->to_user, struct message, to_user);
	list_del(&m->to_user);
	return m;
}

static struct message *msg_get_from_user(struct channel *c, u64 seq)
{
	struct message *m;
	struct list_head *cur, *tmp;

	lockdep_assert_held(&c->lock);

	list_for_each_safe (cur, tmp, &c->from_user) {
		m = list_entry(cur, struct message, from_user);
		if (m->msg.seq == seq) {
			list_del(&m->from_user);
			return m;
		}
	}

	return NULL;
}

static void message_kill(struct message *m, mempool_t *pool)
{
	m->bio->bi_status = BLK_STS_IOERR;
	bio_endio(m->bio);
	bio_put(m->bio);
	mempool_free(m, pool);
}

/*
 * Returns 0 when there is no work left to do.  This must be callable without
 * holding the target lock, as it is part of the waitqueue's check expression.
 * When called without the lock it may spuriously indicate there is remaining
 * work, but when called with the lock it must be accurate.
 */
static int target_poll(struct target *t)
{
	return !list_empty(&t->to_user) || t->dm_destroyed;
}

static void target_release(struct kref *ref)
{
	struct target *t = container_of(ref, struct target, references);
	struct list_head *cur, *tmp;

	/*
	 * There may be outstanding BIOs that have not yet been given to
	 * userspace.  At this point there's nothing we can do about them, as
	 * there are no channels, and never will be.
	 */
	list_for_each_safe (cur, tmp, &t->to_user) {
		message_kill(list_entry(cur, struct message, to_user),
			     &t->message_pool);
	}

	mempool_exit(&t->message_pool);
	mutex_unlock(&t->lock);
	mutex_destroy(&t->lock);
	kfree(t);
}

static void target_put(struct target *t)
{
	/*
	 * This both releases a reference to the target and the lock.  We leave
	 * it up to the caller to hold the lock, as they probably needed it for
	 * something else.
	 */
	lockdep_assert_held(&t->lock);

	if (!kref_put(&t->references, target_release))
		mutex_unlock(&t->lock);
}

static struct channel *channel_alloc(struct target *t)
{
	struct channel *c;

	lockdep_assert_held(&t->lock);

	c = kzalloc(sizeof(*c), GFP_KERNEL);
	if (c == NULL)
		return NULL;

	kref_get(&t->references);
	c->target = t;
	c->cur_from_user = &c->scratch_message_from_user;
	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->from_user);
	return c;
}

static void channel_free(struct channel *c)
{
	struct list_head *cur, *tmp;

	lockdep_assert_held(&c->lock);

	/*
	 * There may be outstanding BIOs that have been given to userspace but
	 * have not yet been completed.  The channel has been shut down so
	 * there's no way to process the rest of those messages, so we just go
	 * ahead and error out the BIOs.  Hopefully whatever's on the other end
	 * can handle the errors.  One could imagine splitting the BIOs and
	 * completing as much as we got, but that seems like overkill here.
	 *
	 * Our only other options would be to let the BIO hang around (which
	 * seems way worse) or to resubmit it to userspace in the hope there's
	 * another channel.  I don't really like the idea of submitting a
	 * message twice.
	 */
	if (c->cur_to_user != NULL)
		message_kill(c->cur_to_user, &c->target->message_pool);
	if (c->cur_from_user != &c->scratch_message_from_user)
		message_kill(c->cur_from_user, &c->target->message_pool);
	list_for_each_safe (cur, tmp, &c->from_user)
		message_kill(list_entry(cur, struct message, from_user),
			     &c->target->message_pool);

	mutex_lock(&c->target->lock);
	target_put(c->target);
	mutex_unlock(&c->lock);
	mutex_destroy(&c->lock);
	kfree(c);
}
static int dev_open(struct inode *inode, struct file *file)
{
	struct channel *c;
	struct target *t;

	/*
	 * This is called by miscdev, which sets private_data to point to the
	 * struct miscdevice that was opened.  The rest of our file operations
	 * want to refer to the channel that's been opened, so we swap that
	 * pointer out with a fresh channel.
	 *
	 * This is called with the miscdev lock held, which is also held while
	 * registering/unregistering the miscdev.  The miscdev must be
	 * registered for this to get called, which means there must be an
	 * outstanding reference to the target, which means it cannot be freed
	 * out from under us despite us not holding a reference yet.
	 */
	t = container_of(file->private_data, struct target, miscdev);
	mutex_lock(&t->lock);
	file->private_data = c = channel_alloc(t);

	if (c == NULL) {
		mutex_unlock(&t->lock);
		return -ENOMEM;
	}

	mutex_unlock(&t->lock);
	return 0;
}

static ssize_t dev_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct channel *c = channel_from_file(iocb->ki_filp);
	ssize_t total_processed = 0;
	ssize_t processed;

	mutex_lock(&c->lock);

	if (unlikely(c->to_user_error)) {
		total_processed = c->to_user_error;
		goto cleanup_unlock;
	}

	if (c->cur_to_user == NULL) {
		struct target *t = target_from_channel(c);

		mutex_lock(&t->lock);

		while (!target_poll(t)) {
			int e;

			mutex_unlock(&t->lock);
			mutex_unlock(&c->lock);
			e = wait_event_interruptible(t->wq, target_poll(t));
			mutex_lock(&c->lock);
			mutex_lock(&t->lock);

			if (unlikely(e != 0)) {
				/*
				 * We haven't processed any bytes in either the
				 * BIO or the IOV, so we can just terminate
				 * right now.  The rest of the kernel handles
				 * restarting the syscall when appropriate.
				 */
				total_processed = e;
				mutex_unlock(&t->lock);
				goto cleanup_unlock;
			}
		}

		if (unlikely(t->dm_destroyed)) {
			/*
			 * DM has destroyed this target, so just lock
			 * the user out.  There's really nothing else
			 * we can do here.  Note that we don't actually
			 * tear anything down until userspace has
			 * closed the FD, as there may still be
			 * outstanding BIOs.
			 *
			 * This is kind of a wacky error code to
			 * return.  My goal was really just to try and
			 * find something that wasn't likely to be
			 * returned by anything else in the miscdev
			 * path.  The message "block device required"
			 * seems like a somewhat reasonable thing to
			 * say when the target has disappeared out from
			 * under us, but "not block" isn't sensible.
			 */
			c->to_user_error = total_processed = -ENOTBLK;
			mutex_unlock(&t->lock);
			goto cleanup_unlock;
		}

		/*
		 * Ensures that accesses to the message data are not ordered
		 * before the remote accesses that produce that message data.
		 *
		 * This pairs with the barrier in user_map(), via the
		 * conditional within the while loop above.  Also see the lack
		 * of barrier in user_dtr(), which is why this can be after the
		 * destroyed check.
		 */
		smp_rmb();

		c->cur_to_user = msg_get_to_user(t);
		WARN_ON(c->cur_to_user == NULL);
		mutex_unlock(&t->lock);
	}

	processed = msg_copy_to_iov(c->cur_to_user, to);
	total_processed += processed;

	WARN_ON(c->cur_to_user->posn_to_user > c->cur_to_user->total_to_user);
	if (c->cur_to_user->posn_to_user == c->cur_to_user->total_to_user) {
		struct message *m = c->cur_to_user;

		c->cur_to_user = NULL;
		list_add_tail(&m->from_user, &c->from_user);
	}

cleanup_unlock:
	mutex_unlock(&c->lock);
	return total_processed;
}
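/*
 * Because each open() of the miscdev creates its own channel, a
 * multi-threaded daemon can simply open /dev/dm-user/<name> once per worker
 * and let each worker block in read() independently; each response must then
 * come back on the channel that read it (see struct channel above).  Sketch
 * for illustration, assuming a hypothetical pthread worker() function that
 * runs the read/respond loop on the fd it is handed:
 *
 *     for (int i = 0; i < nworkers; ++i) {
 *             int fd = open("/dev/dm-user/foo", O_RDWR);
 *             pthread_create(&tid[i], NULL, worker, (void *)(long)fd);
 *     }
 */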
static ssize_t dev_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct channel *c = channel_from_file(iocb->ki_filp);
	ssize_t total_processed = 0;
	ssize_t processed;

	mutex_lock(&c->lock);

	if (unlikely(c->from_user_error)) {
		total_processed = c->from_user_error;
		goto cleanup_unlock;
	}

	/*
	 * cur_from_user can never be NULL.  If there's no real message it must
	 * point to the scratch space.
	 */
	WARN_ON(c->cur_from_user == NULL);
	if (c->cur_from_user->posn_from_user < sizeof(struct dm_user_message)) {
		struct message *msg, *old;

		processed = msg_copy_from_iov(c->cur_from_user, from);
		if (processed <= 0) {
			pr_warn("msg_copy_from_iov() returned %zd\n",
				processed);
			c->from_user_error = -EINVAL;
			goto cleanup_unlock;
		}
		total_processed += processed;

		/*
		 * In the unlikely event the user has provided us a very short
		 * write, not even big enough to fill a message, just succeed.
		 * We'll eventually build up enough bytes to do something.
		 */
		if (unlikely(c->cur_from_user->posn_from_user <
			     sizeof(struct dm_user_message)))
			goto cleanup_unlock;

		old = c->cur_from_user;
		mutex_lock(&c->target->lock);
		msg = msg_get_from_user(c, c->cur_from_user->msg.seq);
		if (msg == NULL) {
			pr_info("user provided an invalid message seq of %llx\n",
				old->msg.seq);
			mutex_unlock(&c->target->lock);
			c->from_user_error = -EINVAL;
			goto cleanup_unlock;
		}
		mutex_unlock(&c->target->lock);

		WARN_ON(old->posn_from_user != sizeof(struct dm_user_message));
		msg->posn_from_user = sizeof(struct dm_user_message);
		msg->return_type = old->msg.type;
		msg->return_flags = old->msg.flags;
		WARN_ON(msg->posn_from_user > msg->total_from_user);

		c->cur_from_user = msg;
		WARN_ON(old != &c->scratch_message_from_user);
	}

	/*
	 * Userspace can signal an error for single requests by overwriting
	 * the type field.
	 */
	switch (c->cur_from_user->return_type) {
	case DM_USER_RESP_SUCCESS:
		c->cur_from_user->bio->bi_status = BLK_STS_OK;
		break;
	case DM_USER_RESP_ERROR:
	case DM_USER_RESP_UNSUPPORTED:
	default:
		c->cur_from_user->bio->bi_status = BLK_STS_IOERR;
		goto finish_bio;
	}

	/*
	 * The op was a success as far as userspace is concerned, so process
	 * whatever data may come along with it.  The user may provide the BIO
	 * data in multiple chunks, in which case we don't need to finish the
	 * BIO.
	 */
	processed = msg_copy_from_iov(c->cur_from_user, from);
	total_processed += processed;

	if (c->cur_from_user->posn_from_user <
	    c->cur_from_user->total_from_user)
		goto cleanup_unlock;

finish_bio:
	/*
	 * When we set up this message the BIO's size matched the
	 * message size, if that's not still the case then something
	 * has gone off the rails.
	 */
	WARN_ON(bio_size(c->cur_from_user->bio) != 0);
	bio_endio(c->cur_from_user->bio);
	bio_put(c->cur_from_user->bio);

	/*
	 * We don't actually need to take the target lock here, as all
	 * we're doing is freeing the message and mempools have their
	 * own lock.  Each channel has its own scratch message.
	 */
	WARN_ON(c->cur_from_user == &c->scratch_message_from_user);
	mempool_free(c->cur_from_user, &c->target->message_pool);
	c->scratch_message_from_user.posn_from_user = 0;
	c->cur_from_user = &c->scratch_message_from_user;

cleanup_unlock:
	mutex_unlock(&c->lock);
	return total_processed;
}

static int dev_release(struct inode *inode, struct file *file)
{
	struct channel *c;

	c = channel_from_file(file);
	mutex_lock(&c->lock);
	channel_free(c);

	return 0;
}

static const struct file_operations file_operations = {
	.owner = THIS_MODULE,
	.open = dev_open,
	.llseek = no_llseek,
	.read_iter = dev_read,
	.write_iter = dev_write,
	.release = dev_release,
};
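/*
 * From the userspace side, failing a single request is just a matter of
 * echoing the header back with the type replaced (sketch; msg is the header
 * previously read from the channel fd):
 *
 *     msg->type = DM_USER_RESP_ERROR;   // or DM_USER_RESP_UNSUPPORTED
 *     write(fd, msg, sizeof(*msg));     // no payload on error paths
 *
 * The seq field must be left intact: a response carrying an unknown seq is
 * treated as a protocol error and poisons the whole channel (see the
 * msg_get_from_user() lookup in dev_write() above).
 */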
static int user_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct target *t;
	int r;

	if (argc != 3) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto cleanup_none;
	}

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (t == NULL) {
		r = -ENOMEM;
		goto cleanup_none;
	}
	ti->private = t;

	/* Enable more BIO types. */
	ti->num_discard_bios = 1;
	ti->discards_supported = true;
	ti->num_flush_bios = 1;
	ti->flush_supported = true;

	/*
	 * We begin with a single reference to the target, which is miscdev's
	 * reference.  This ensures that the target won't be freed
	 * until after the miscdev has been unregistered and all extant
	 * channels have been closed.
	 */
	kref_init(&t->references);

	mutex_init(&t->lock);
	init_waitqueue_head(&t->wq);
	INIT_LIST_HEAD(&t->to_user);
	mempool_init_kmalloc_pool(&t->message_pool, MAX_OUTSTANDING_MESSAGES,
				  sizeof(struct message));

	t->miscdev.minor = MISC_DYNAMIC_MINOR;
	t->miscdev.fops = &file_operations;
	t->miscdev.name = kasprintf(GFP_KERNEL, "dm-user/%s", argv[2]);
	if (t->miscdev.name == NULL) {
		r = -ENOMEM;
		goto cleanup_message_pool;
	}

	/*
	 * Once the miscdev is registered it can be opened and therefore
	 * concurrent references to the channel can happen.  Holding the target
	 * lock during misc_register() could deadlock.  If registration
	 * succeeds then we will not access the target again so we just stick a
	 * barrier here, which pairs with taking the target lock everywhere
	 * else the target is accessed.
	 *
	 * I forgot where we ended up on the RCpc/RCsc locks.  IIUC RCsc locks
	 * would mean that we could take the target lock earlier and release it
	 * here instead of the memory barrier.  I'm not sure that's any better,
	 * though, and this isn't on a hot path so it probably doesn't matter
	 * either way.
	 */
	smp_mb();

	r = misc_register(&t->miscdev);
	if (r) {
		DMERR("Unable to register miscdev %s for dm-user",
		      t->miscdev.name);
		r = -ENOMEM;
		goto cleanup_misc_name;
	}

	return 0;

cleanup_misc_name:
	kfree(t->miscdev.name);
cleanup_message_pool:
	mempool_exit(&t->message_pool);
	kfree(t);
cleanup_none:
	return r;
}
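/*
 * For illustration, a target might be created like this (hedged sketch:
 * user_ctr() above only checks that there are exactly three target arguments
 * and consumes argv[2] as the miscdev name; the first two arguments are not
 * used, so the zeros below are placeholder assumptions):
 *
 *     # 2097152 512-byte sectors (1 GiB), serviced via /dev/dm-user/foo
 *     dmsetup create foo --table "0 2097152 user 0 0 foo"
 */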
static void user_dtr(struct dm_target *ti)
{
	struct target *t = target_from_target(ti);

	/*
	 * Removes the miscdev.  This must be called without the target lock
	 * held to avoid a possible deadlock because our open implementation is
	 * called holding the miscdev lock and must later take the target lock.
	 *
	 * There is no race here because only DM can register/unregister the
	 * miscdev, and DM ensures that doesn't happen twice.  The internal
	 * miscdev lock is sufficient to ensure there are no races between
	 * deregistering the miscdev and open.
	 */
	misc_deregister(&t->miscdev);

	/*
	 * We are now free to take the target's lock and drop our reference to
	 * the target.  There are almost certainly tasks sleeping in read on at
	 * least one of the channels associated with this target; this
	 * explicitly wakes them up and terminates the read.
	 */
	mutex_lock(&t->lock);
	/*
	 * No barrier here, as wait/wake ensures that the flag visibility is
	 * correct WRT the wake/sleep state of the target tasks.
	 */
	t->dm_destroyed = true;
	wake_up_all(&t->wq);
	target_put(t);
}

/*
 * Consumes a BIO from device mapper, queueing it up for userspace.
 */
static int user_map(struct dm_target *ti, struct bio *bio)
{
	struct target *t;
	struct message *entry;

	t = target_from_target(ti);
	/*
	 * FIXME
	 *
	 * This seems like a bad idea.  Specifically, here we're
	 * directly on the IO path when we take the target lock, which may also
	 * be taken from a user context.  The user context doesn't actively
	 * trigger anything that may sleep while holding the lock, but this
	 * still seems like a bad idea.
	 *
	 * The obvious way to fix this would be to use a proper queue, which
	 * would result in no shared locks between the direct IO path and user
	 * tasks.  I had a version that did this, but the head-of-line blocking
	 * from the circular buffer resulted in us needing a fairly large
	 * allocation in order to avoid situations in which the queue fills up
	 * and everything goes off the rails.
	 *
	 * I could jump through some hoops to avoid a shared lock while still
	 * allowing for a large queue, but I'm not actually sure that allowing
	 * for very large queues is the right thing to do here.  Intuitively it
	 * seems better to keep the queues small in here (essentially sized to
	 * the user latency for performance reasons only) and rely on returning
	 * DM_MAPIO_REQUEUE regularly, as that would give the rest of the
	 * kernel more information.
	 *
	 * I'll spend some time trying to figure out what's going on with
	 * DM_MAPIO_REQUEUE, but if someone has a better idea of how to fix
	 * this I'm all ears.
	 */
	mutex_lock(&t->lock);

	/*
	 * FIXME
	 *
	 * The assumption here is that there's no benefit to returning
	 * DM_MAPIO_KILL as opposed to just erroring out the BIO, but I'm not
	 * sure that's actually true -- for example, I could imagine users
	 * expecting that submitted BIOs are unlikely to fail and therefore
	 * relying on submission failure to indicate an unsupported type.
	 *
	 * There's two ways I can think of to fix this:
	 *   - Add DM arguments that are parsed during the constructor that
	 *     allow various dm_target flags to be set that indicate the op
	 *     types supported by this target.  This may make sense for things
	 *     like discard, where DM can already transform the BIOs to a form
	 *     that's likely to be supported.
	 *   - Some sort of pre-filter that allows userspace to hook in here
	 *     and kill BIOs before marking them as submitted.  My guess would
	 *     be that a userspace round trip is a bad idea here, but a BPF
	 *     call seems reasonable.
	 *
	 * My guess is that we'd likely want to do both.  The first one is easy
	 * and gives DM the proper info, so it seems better.  The BPF call
	 * seems overly complex for just this, but one could imagine wanting to
	 * sometimes return _MAPPED and a BPF filter would be the way to do
	 * that.
	 *
	 * For example, in Android we have an in-kernel DM device called
	 * "dm-bow" that takes advantage of some portion of the space that has
	 * been discarded on a device to provide opportunistic block-level
	 * backups.  While one could imagine just implementing this entirely in
	 * userspace, that would come with an appreciable performance penalty.
	 * Instead one could keep a BPF program that forwards most accesses
	 * directly to the backing block device while informing a userspace
	 * daemon of any discarded space and of writes to blocks that are to be
	 * backed up.
	 */
	if (unlikely((bio_type_to_user_type(bio) < 0) ||
		     (bio_flags_to_user_flags(bio) < 0))) {
		mutex_unlock(&t->lock);
		return DM_MAPIO_KILL;
	}

	entry = msg_get_map(t);
	if (unlikely(entry == NULL)) {
		mutex_unlock(&t->lock);
		return DM_MAPIO_REQUEUE;
	}

	bio_get(bio);
	entry->msg.type = bio_type_to_user_type(bio);
	entry->msg.flags = bio_flags_to_user_flags(bio);
	entry->msg.sector = bio->bi_iter.bi_sector;
	entry->msg.len = bio_size(bio);
	entry->bio = bio;
	entry->posn_to_user = 0;
	entry->total_to_user = bio_bytes_needed_to_user(bio);
	entry->posn_from_user = 0;
	entry->total_from_user = bio_bytes_needed_from_user(bio);
	/* Pairs with the barrier in dev_read() */
	smp_wmb();
	list_add_tail(&entry->to_user, &t->to_user);
	wake_up_interruptible(&t->wq);
	mutex_unlock(&t->lock);
	return DM_MAPIO_SUBMITTED;
}

static struct target_type user_target = {
	.name = "user",
	.version = { 1, 0, 0 },
	.module = THIS_MODULE,
	.ctr = user_ctr,
	.dtr = user_dtr,
	.map = user_map,
};

static int __init dm_user_init(void)
{
	int r;

	r = dm_register_target(&user_target);
	if (r) {
		DMERR("register failed %d", r);
		goto error;
	}

	return 0;

error:
	return r;
}

static void __exit dm_user_exit(void)
{
	dm_unregister_target(&user_target);
}

module_init(dm_user_init);
module_exit(dm_user_exit);
MODULE_AUTHOR("Palmer Dabbelt");
MODULE_DESCRIPTION(DM_NAME " target returning blocks from userspace");
MODULE_LICENSE("GPL");