// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2020 Google, Inc
 * Copyright (C) 2020 Palmer Dabbelt <[email protected]>
 */

#include <linux/device-mapper.h>
#include <uapi/linux/dm-user.h>

#include <linux/bio.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/uio.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

#define DM_MSG_PREFIX "user"
#define MAX_OUTSTANDING_MESSAGES 128

static unsigned int daemon_timeout_msec = 4000;
module_param_named(dm_user_daemon_timeout_msec, daemon_timeout_msec, uint,
		   0644);
MODULE_PARM_DESC(dm_user_daemon_timeout_msec,
		 "IO Timeout in msec if daemon does not process");
/*
 * dm-user uses four structures:
 *
 *  - "struct target", the outermost structure, corresponds to a single device
 *    mapper target.  This contains the set of outstanding BIOs that have been
 *    provided by DM and are not actively being processed by the user, along
 *    with a misc device that userspace can open to communicate with the
 *    kernel.  Each time userspace opens the misc device a new channel is
 *    created.
 *  - "struct channel", which represents a single active communication channel
 *    with userspace.  Userspace may choose arbitrary read/write sizes to use
 *    when processing messages; channels assemble these into logical accesses.
 *    When userspace responds to a full message the channel completes the BIO
 *    and obtains a new message to process from the target.
 *  - "struct message", which wraps a BIO with the additional information
 *    required by the kernel to sort out what to do with BIOs when they return
 *    from userspace.
 *  - "struct dm_user_message", which is the exact message format that
 *    userspace sees.
 *
 * The hot path contains three distinct operations:
 *
 *  - user_map(), which is provided a BIO from device mapper that is queued
 *    into the target.  This allocates and enqueues a new message.
 *  - dev_read(), which dequeues a message and copies it to userspace.
 *  - dev_write(), which looks up a message (keyed by sequence number) and
 *    completes the corresponding BIO.
 *
 * Lock ordering (outer to inner)
 *
 * 1) miscdevice's global lock.  This is held around dev_open, so it has to be
 *    the outermost lock.
 * 2) target->lock
 * 3) channel->lock
 */
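/*
 * Rough shape of a userspace daemon's loop against a /dev/dm-user/<name>
 * node, as inferred from the code below (a sketch, not a reference
 * implementation): read() a struct dm_user_message header, plus the payload
 * for DM_USER_REQ_MAP_WRITE requests; perform the IO; then write() the header
 * back with the type set to DM_USER_RESP_SUCCESS (or _ERROR/_UNSUPPORTED),
 * followed by the payload for DM_USER_REQ_MAP_READ requests.  Both reads and
 * writes may be split into arbitrarily small chunks.
 */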
struct message {
	/*
	 * Messages themselves do not need a lock, they're protected by either
	 * the target or channel's lock, depending on which can reference them
	 * directly.
	 */
	struct dm_user_message msg;
	struct bio *bio;
	size_t posn_to_user;
	size_t total_to_user;
	size_t posn_from_user;
	size_t total_from_user;

	struct list_head from_user;
	struct list_head to_user;

	/*
	 * These are written back from the user.  They live in the same spot in
	 * the message, but we need to either keep the old values around or
	 * call a bunch more BIO helpers.  These are only valid after write has
	 * adopted the message.
	 */
	u64 return_type;
	u64 return_flags;

	struct delayed_work work;
	bool delayed;
	struct target *t;
};
struct target {
	/*
	 * A target has a single lock, which protects everything in the target
	 * (but does not protect the channels associated with a target).
	 */
	struct mutex lock;

	/*
	 * There is only one point at which anything blocks: userspace blocks
	 * reading a new message, which is woken up by device mapper providing
	 * a new BIO to process (or tearing down the target).  The
	 * corresponding write side doesn't block, instead we treat userspace's
	 * response containing a message that has yet to be mapped as an
	 * invalid operation.
	 */
	struct wait_queue_head wq;

	/*
	 * Messages are delivered to userspace in order, but may be returned
	 * out of order.  This allows userspace to schedule IO if it wants to.
	 */
	mempool_t message_pool;
	u64 next_seq_to_map;
	u64 next_seq_to_user;
	struct list_head to_user;

	/*
	 * There is a misc device per target.  The name is selected by
	 * userspace (via a DM create ioctl argument), and each ends up in
	 * /dev/dm-user/.  It looks like a better way to do this may be to have
	 * a filesystem to manage these, but this was more expedient.  The
	 * current mechanism is functional, but does result in an arbitrary
	 * number of dynamically created misc devices.
	 */
	struct miscdevice miscdev;

	/*
	 * Device mapper's target destructor triggers tearing this all down,
	 * but we can't actually free until every channel associated with this
	 * target has been destroyed.  Channels each have a reference to their
	 * target, and there is an additional single reference that corresponds
	 * to both DM and the misc device (both of which are destroyed by DM).
	 *
	 * In the common case userspace will be asleep waiting for a new
	 * message when device mapper decides to destroy the target, which
	 * means no new messages will appear.  The destroyed flag triggers a
	 * wakeup, which will end up removing the reference.
	 */
	struct kref references;
	int dm_destroyed;
	bool daemon_terminated;
};
struct channel {
	struct target *target;

	/*
	 * A channel has a single lock, which prevents multiple reads (or
	 * multiple writes) from conflicting with each other.
	 */
	struct mutex lock;

	struct message *cur_to_user;
	struct message *cur_from_user;
	ssize_t to_user_error;
	ssize_t from_user_error;

	/*
	 * Once a message has been forwarded to userspace on a channel it must
	 * be responded to on the same channel.  This allows us to error out
	 * the messages that have not yet been responded to by a channel when
	 * that channel closes, which makes handling errors more reasonable for
	 * fault-tolerant userspace daemons.  It also happens to make avoiding
	 * shared locks between user_map() and dev_read() a lot easier.
	 *
	 * This does preclude a multi-threaded work stealing userspace
	 * implementation (or at least, force a degree of head-of-line blocking
	 * on the response path).
	 */
	struct list_head from_user;

	/*
	 * Responses from userspace can arrive in arbitrarily small chunks.
	 * We need some place to buffer one up until we can find the
	 * corresponding kernel-side message to continue processing, so instead
	 * of allocating them we just keep one off to the side here.  This can
	 * only ever be pointed to by cur_from_user, and will never have a BIO.
	 */
	struct message scratch_message_from_user;
};
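/*
 * Terminate a message's BIO with an I/O error and return the message to the
 * target's mempool.  The caller holds whichever lock (target or channel)
 * currently owns the message.
 */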
static void message_kill(struct message *m, mempool_t *pool)
{
	m->bio->bi_status = BLK_STS_IOERR;
	bio_endio(m->bio);
	mempool_free(m, pool);
}

static inline bool is_user_space_thread_present(struct target *t)
{
	lockdep_assert_held(&t->lock);
	return (kref_read(&t->references) > 1);
}
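/*
 * Delayed-work handler for a message that was queued while no daemon was
 * attached to the target.  If a daemon has shown up in the meantime the
 * message is left alone; otherwise the IO is failed here.
 */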
static void process_delayed_work(struct work_struct *work)
{
	struct delayed_work *del_work = to_delayed_work(work);
	struct message *msg = container_of(del_work, struct message, work);
	struct target *t = msg->t;

	mutex_lock(&t->lock);

	/*
	 * There is at least one thread to process the IO.
	 */
	if (is_user_space_thread_present(t)) {
		mutex_unlock(&t->lock);
		return;
	}

	/*
	 * Terminate the IO with an error
	 */
	list_del(&msg->to_user);
	pr_err("I/O error: sector %llu: no user-space daemon for %s target\n",
	       msg->bio->bi_iter.bi_sector,
	       t->miscdev.name);
	message_kill(msg, &t->message_pool);
	mutex_unlock(&t->lock);
}
static void enqueue_delayed_work(struct message *m, bool is_delay)
{
	unsigned long delay = 0;

	m->delayed = true;
	INIT_DELAYED_WORK(&m->work, process_delayed_work);

	/*
	 * The snapuserd daemon is the user-space process which handles IO
	 * requests from dm-user when an OTA is applied.  Per the current
	 * design, when a dm-user target is created, the daemon attaches to
	 * the target and starts processing the IOs.  The daemon is terminated
	 * only when the dm-user target is destroyed.
	 *
	 * If for some reason the daemon crashes or terminates early, without
	 * destroying the dm-user target, then there is no mechanism to
	 * restart the daemon and resume processing IOs for the same target.
	 * Theoretically it is possible, but that infrastructure doesn't exist
	 * in the Android ecosystem.
	 *
	 * Thus, when the daemon terminates there is no way the IOs issued on
	 * that target will ever be processed.  Hence, we set the delay to 0
	 * and fail the IOs immediately.
	 *
	 * On the other hand, when a new dm-user target is created, we wait
	 * for the daemon to attach for the first time.  This primarily
	 * happens when first-stage init spins up the daemon.  At that point,
	 * since the snapshot device is mounted as part of the root
	 * filesystem, the dm-user target may receive IO requests even though
	 * the daemon is not fully launched.  We don't want to fail those IO
	 * requests immediately, so we queue them with a timeout, giving the
	 * daemon time to become ready to process them.  If the daemon fails
	 * to launch within the timeout period, the IOs are failed.
	 */
	if (is_delay)
		delay = msecs_to_jiffies(daemon_timeout_msec);

	queue_delayed_work(system_wq, &m->work, delay);
}
static inline struct target *target_from_target(struct dm_target *target)
{
	WARN_ON(target->private == NULL);
	return target->private;
}

static inline struct target *target_from_miscdev(struct miscdevice *miscdev)
{
	return container_of(miscdev, struct target, miscdev);
}

static inline struct channel *channel_from_file(struct file *file)
{
	WARN_ON(file->private_data == NULL);
	return file->private_data;
}

static inline struct target *target_from_channel(struct channel *c)
{
	WARN_ON(c->target == NULL);
	return c->target;
}
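/* Number of data bytes remaining in the BIO, summed over its segments. */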
static inline size_t bio_size(struct bio *bio)
{
	struct bio_vec bvec;
	struct bvec_iter iter;
	size_t out = 0;

	bio_for_each_segment (bvec, bio, iter)
		out += bio_iter_len(bio, iter);
	return out;
}
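/*
 * Bytes that will cross the kernel/userspace boundary for this BIO: every op
 * carries a struct dm_user_message header; writes additionally carry their
 * payload to userspace, and reads carry their payload back from userspace.
 */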
static inline size_t bio_bytes_needed_to_user(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_WRITE:
		return sizeof(struct dm_user_message) + bio_size(bio);
	case REQ_OP_READ:
	case REQ_OP_FLUSH:
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_ZEROES:
		return sizeof(struct dm_user_message);

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}

static inline size_t bio_bytes_needed_from_user(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_READ:
		return sizeof(struct dm_user_message) + bio_size(bio);
	case REQ_OP_WRITE:
	case REQ_OP_FLUSH:
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_ZEROES:
		return sizeof(struct dm_user_message);

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}
static inline long bio_type_to_user_type(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_READ:
		return DM_USER_REQ_MAP_READ;
	case REQ_OP_WRITE:
		return DM_USER_REQ_MAP_WRITE;
	case REQ_OP_FLUSH:
		return DM_USER_REQ_MAP_FLUSH;
	case REQ_OP_DISCARD:
		return DM_USER_REQ_MAP_DISCARD;
	case REQ_OP_SECURE_ERASE:
		return DM_USER_REQ_MAP_SECURE_ERASE;
	case REQ_OP_WRITE_ZEROES:
		return DM_USER_REQ_MAP_WRITE_ZEROES;

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}
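/*
 * Translate the REQ_* flags we understand into DM_USER_REQ_MAP_FLAG_* values;
 * any flag left over is treated as unsupported and fails the mapping.
 */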
static inline long bio_flags_to_user_flags(struct bio *bio)
{
	u64 out = 0;
	typeof(bio->bi_opf) opf = bio->bi_opf & ~REQ_OP_MASK;

	if (opf & REQ_FAILFAST_DEV) {
		opf &= ~REQ_FAILFAST_DEV;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DEV;
	}

	if (opf & REQ_FAILFAST_TRANSPORT) {
		opf &= ~REQ_FAILFAST_TRANSPORT;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_TRANSPORT;
	}

	if (opf & REQ_FAILFAST_DRIVER) {
		opf &= ~REQ_FAILFAST_DRIVER;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DRIVER;
	}

	if (opf & REQ_SYNC) {
		opf &= ~REQ_SYNC;
		out |= DM_USER_REQ_MAP_FLAG_SYNC;
	}

	if (opf & REQ_META) {
		opf &= ~REQ_META;
		out |= DM_USER_REQ_MAP_FLAG_META;
	}

	if (opf & REQ_PRIO) {
		opf &= ~REQ_PRIO;
		out |= DM_USER_REQ_MAP_FLAG_PRIO;
	}

	if (opf & REQ_NOMERGE) {
		opf &= ~REQ_NOMERGE;
		out |= DM_USER_REQ_MAP_FLAG_NOMERGE;
	}

	if (opf & REQ_IDLE) {
		opf &= ~REQ_IDLE;
		out |= DM_USER_REQ_MAP_FLAG_IDLE;
	}

	if (opf & REQ_INTEGRITY) {
		opf &= ~REQ_INTEGRITY;
		out |= DM_USER_REQ_MAP_FLAG_INTEGRITY;
	}

	if (opf & REQ_FUA) {
		opf &= ~REQ_FUA;
		out |= DM_USER_REQ_MAP_FLAG_FUA;
	}

	if (opf & REQ_PREFLUSH) {
		opf &= ~REQ_PREFLUSH;
		out |= DM_USER_REQ_MAP_FLAG_PREFLUSH;
	}

	if (opf & REQ_RAHEAD) {
		opf &= ~REQ_RAHEAD;
		out |= DM_USER_REQ_MAP_FLAG_RAHEAD;
	}

	if (opf & REQ_BACKGROUND) {
		opf &= ~REQ_BACKGROUND;
		out |= DM_USER_REQ_MAP_FLAG_BACKGROUND;
	}

	if (opf & REQ_NOWAIT) {
		opf &= ~REQ_NOWAIT;
		out |= DM_USER_REQ_MAP_FLAG_NOWAIT;
	}

	if (opf & REQ_NOUNMAP) {
		opf &= ~REQ_NOUNMAP;
		out |= DM_USER_REQ_MAP_FLAG_NOUNMAP;
	}

	if (unlikely(opf)) {
		pr_warn("unsupported BIO type %x\n", opf);
		return -EOPNOTSUPP;
	}
	WARN_ON(out < 0);
	return out;
}
/*
 * Not quite what's in blk-map.c, but instead what I thought the functions in
 * blk-map did.  This one seems more generally useful and I think we could
 * write the blk-map version in terms of this one.  The differences are that
 * this has a return value that counts, and blk-map uses the BIO _all iters.
 * Both helpers advance the IOV iter but not the BIO iter, which is a bit odd
 * here.
 */
static ssize_t bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
{
	struct bio_vec bvec;
	struct bvec_iter biter;
	ssize_t out = 0;

	bio_for_each_segment (bvec, bio, biter) {
		ssize_t ret;

		ret = copy_page_from_iter(bvec.bv_page, bvec.bv_offset,
					  bvec.bv_len, iter);

		/*
		 * FIXME: I thought that IOV copies had a mechanism for
		 * terminating early, if for example a signal came in while
		 * sleeping waiting for a page to be mapped, but I don't see
		 * where that would happen.
		 */
		WARN_ON(ret < 0);
		out += ret;

		if (!iov_iter_count(iter))
			break;

		if (ret < bvec.bv_len)
			return ret;
	}

	return out;
}
static ssize_t bio_copy_to_iter(struct bio *bio, struct iov_iter *iter)
{
	struct bio_vec bvec;
	struct bvec_iter biter;
	ssize_t out = 0;

	bio_for_each_segment (bvec, bio, biter) {
		ssize_t ret;

		ret = copy_page_to_iter(bvec.bv_page, bvec.bv_offset,
					bvec.bv_len, iter);

		/* as above */
		WARN_ON(ret < 0);
		out += ret;

		if (!iov_iter_count(iter))
			break;

		if (ret < bvec.bv_len)
			return ret;
	}

	return out;
}
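/*
 * Copy the next chunk of a message to (or from) userspace: the struct
 * dm_user_message header goes first, then the BIO payload.  posn_to_user and
 * posn_from_user track how far we've gotten, so these can be called
 * repeatedly with arbitrarily small iov_iters.
 */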
static ssize_t msg_copy_to_iov(struct message *msg, struct iov_iter *to)
{
	ssize_t copied = 0;

	if (!iov_iter_count(to))
		return 0;

	if (msg->posn_to_user < sizeof(msg->msg)) {
		copied = copy_to_iter((char *)(&msg->msg) + msg->posn_to_user,
				      sizeof(msg->msg) - msg->posn_to_user, to);
	} else {
		copied = bio_copy_to_iter(msg->bio, to);
		if (copied > 0)
			bio_advance(msg->bio, copied);
	}

	if (copied < 0)
		return copied;

	msg->posn_to_user += copied;
	return copied;
}

static ssize_t msg_copy_from_iov(struct message *msg, struct iov_iter *from)
{
	ssize_t copied = 0;

	if (!iov_iter_count(from))
		return 0;

	if (msg->posn_from_user < sizeof(msg->msg)) {
		copied = copy_from_iter(
			(char *)(&msg->msg) + msg->posn_from_user,
			sizeof(msg->msg) - msg->posn_from_user, from);
	} else {
		copied = bio_copy_from_iter(msg->bio, from);
		if (copied > 0)
			bio_advance(msg->bio, copied);
	}

	if (copied < 0)
		return copied;

	msg->posn_from_user += copied;
	return copied;
}
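/*
 * Allocate a message for a freshly mapped BIO and hand it the next sequence
 * number.  Called from user_map() with the target lock held; GFP_NOIO keeps
 * the allocation from recursing back into the IO path.
 */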
static struct message *msg_get_map(struct target *t)
{
	struct message *m;

	lockdep_assert_held(&t->lock);

	m = mempool_alloc(&t->message_pool, GFP_NOIO);
	m->msg.seq = t->next_seq_to_map++;
	INIT_LIST_HEAD(&m->to_user);
	INIT_LIST_HEAD(&m->from_user);
	return m;
}

static struct message *msg_get_to_user(struct target *t)
{
	struct message *m;

	lockdep_assert_held(&t->lock);

	if (list_empty(&t->to_user))
		return NULL;

	m = list_first_entry(&t->to_user, struct message, to_user);

	list_del(&m->to_user);

	/*
	 * If the IO was queued to the workqueue because there was no daemon
	 * to service it, then we have to cancel the delayed work, as the IO
	 * will now be processed by this user-space thread.
	 *
	 * If the delayed work was already picked up for processing, then wait
	 * for it to complete.  Note that the IO will not be terminated by the
	 * work queue thread.
	 */
	if (unlikely(m->delayed)) {
		mutex_unlock(&t->lock);
		cancel_delayed_work_sync(&m->work);
		mutex_lock(&t->lock);
	}
	return m;
}

static struct message *msg_get_from_user(struct channel *c, u64 seq)
{
	struct message *m;
	struct list_head *cur, *tmp;

	lockdep_assert_held(&c->lock);

	list_for_each_safe (cur, tmp, &c->from_user) {
		m = list_entry(cur, struct message, from_user);
		if (m->msg.seq == seq) {
			list_del(&m->from_user);
			return m;
		}
	}

	return NULL;
}
/*
 * Returns 0 when there is no work left to do.  This must be callable without
 * holding the target lock, as it is part of the waitqueue's check expression.
 * When called without the lock it may spuriously indicate there is remaining
 * work, but when called with the lock it must be accurate.
 */
static int target_poll(struct target *t)
{
	return !list_empty(&t->to_user) || t->dm_destroyed;
}

static void target_release(struct kref *ref)
{
	struct target *t = container_of(ref, struct target, references);
	struct list_head *cur, *tmp;

	/*
	 * There may be outstanding BIOs that have not yet been given to
	 * userspace.  At this point there's nothing we can do about them, as
	 * there are and will never be any channels.
	 */
	list_for_each_safe (cur, tmp, &t->to_user) {
		struct message *m = list_entry(cur, struct message, to_user);

		if (unlikely(m->delayed)) {
			bool ret;

			mutex_unlock(&t->lock);
			ret = cancel_delayed_work_sync(&m->work);
			mutex_lock(&t->lock);
			if (!ret)
				continue;
		}
		message_kill(m, &t->message_pool);
	}

	mempool_exit(&t->message_pool);
	mutex_unlock(&t->lock);
	mutex_destroy(&t->lock);
	kfree(t);
}

static void target_put(struct target *t)
{
	/*
	 * This both releases a reference to the target and the lock.  We leave
	 * it up to the caller to hold the lock, as they probably needed it for
	 * something else.
	 */
	lockdep_assert_held(&t->lock);

	if (!kref_put(&t->references, target_release)) {
		/*
		 * The user-space thread is being terminated.  We need to scan
		 * the list for all pending IOs that have not been processed
		 * yet and put them back on the workqueue for delayed
		 * processing.
		 */
		if (!is_user_space_thread_present(t)) {
			struct list_head *cur, *tmp;

			list_for_each_safe(cur, tmp, &t->to_user) {
				struct message *m = list_entry(cur,
							       struct message,
							       to_user);
				if (!m->delayed)
					enqueue_delayed_work(m, false);
			}
			/*
			 * The daemon attached to this target has terminated.
			 */
			t->daemon_terminated = true;
		}
		mutex_unlock(&t->lock);
	}
}
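/*
 * Create the per-open channel state.  Takes an extra reference on the target,
 * which is dropped again in channel_free() once the FD is closed.
 */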
static struct channel *channel_alloc(struct target *t)
{
	struct channel *c;

	lockdep_assert_held(&t->lock);

	c = kzalloc(sizeof(*c), GFP_KERNEL);
	if (c == NULL)
		return NULL;

	kref_get(&t->references);
	c->target = t;
	c->cur_from_user = &c->scratch_message_from_user;
	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->from_user);
	return c;
}

static void channel_free(struct channel *c)
{
	struct list_head *cur, *tmp;

	lockdep_assert_held(&c->lock);

	/*
	 * There may be outstanding BIOs that have been given to userspace but
	 * have not yet been completed.  The channel has been shut down so
	 * there's no way to process the rest of those messages, so we just go
	 * ahead and error out the BIOs.  Hopefully whatever's on the other end
	 * can handle the errors.  One could imagine splitting the BIOs and
	 * completing as much as we got, but that seems like overkill here.
	 *
	 * Our only other options would be to let the BIO hang around (which
	 * seems way worse) or to resubmit it to userspace in the hope there's
	 * another channel.  I don't really like the idea of submitting a
	 * message twice.
	 */
	if (c->cur_to_user != NULL)
		message_kill(c->cur_to_user, &c->target->message_pool);
	if (c->cur_from_user != &c->scratch_message_from_user)
		message_kill(c->cur_from_user, &c->target->message_pool);
	list_for_each_safe (cur, tmp, &c->from_user)
		message_kill(list_entry(cur, struct message, from_user),
			     &c->target->message_pool);

	mutex_lock(&c->target->lock);
	target_put(c->target);
	mutex_unlock(&c->lock);
	mutex_destroy(&c->lock);
	kfree(c);
}
static int dev_open(struct inode *inode, struct file *file)
{
	struct channel *c;
	struct target *t;

	/*
	 * This is called by miscdev, which sets private_data to point to the
	 * struct miscdevice that was opened.  The rest of our file operations
	 * want to refer to the channel that's been opened, so we swap that
	 * pointer out with a fresh channel.
	 *
	 * This is called with the miscdev lock held, which is also held while
	 * registering/unregistering the miscdev.  The miscdev must be
	 * registered for this to get called, which means there must be an
	 * outstanding reference to the target, which means it cannot be freed
	 * out from under us despite us not holding a reference yet.
	 */
	t = container_of(file->private_data, struct target, miscdev);
	mutex_lock(&t->lock);
	file->private_data = c = channel_alloc(t);

	if (c == NULL) {
		mutex_unlock(&t->lock);
		return -ENOMEM;
	}

	mutex_unlock(&t->lock);
	return 0;
}
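/*
 * read_iter handler.  Blocks until a message is available (or the target is
 * torn down), then streams the current message out in whatever chunk sizes
 * userspace asks for.  Once a message has been fully copied it is parked on
 * the channel's from_user list to wait for the response via dev_write().
 */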
static ssize_t dev_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct channel *c = channel_from_file(iocb->ki_filp);
	ssize_t total_processed = 0;
	ssize_t processed;

	mutex_lock(&c->lock);

	if (unlikely(c->to_user_error)) {
		total_processed = c->to_user_error;
		goto cleanup_unlock;
	}

	if (c->cur_to_user == NULL) {
		struct target *t = target_from_channel(c);

		mutex_lock(&t->lock);

		while (!target_poll(t)) {
			int e;

			mutex_unlock(&t->lock);
			mutex_unlock(&c->lock);
			e = wait_event_interruptible(t->wq, target_poll(t));
			mutex_lock(&c->lock);
			mutex_lock(&t->lock);

			if (unlikely(e != 0)) {
				/*
				 * We haven't processed any bytes in either the
				 * BIO or the IOV, so we can just terminate
				 * right now.  The rest of the kernel handles
				 * restarting the syscall when appropriate.
				 */
				total_processed = e;
				mutex_unlock(&t->lock);
				goto cleanup_unlock;
			}
		}

		if (unlikely(t->dm_destroyed)) {
			/*
			 * DM has destroyed this target, so just lock the user
			 * out.  There's really nothing else we can do here.
			 * Note that we don't actually tear anything down until
			 * userspace has closed the FD, as there may still be
			 * outstanding BIOs.
			 *
			 * This is kind of a wacky error code to return.  My
			 * goal was really just to try and find something that
			 * wasn't likely to be returned by anything else in the
			 * miscdev path.  The message "block device required"
			 * seems like a somewhat reasonable thing to say when
			 * the target has disappeared out from under us, but
			 * "not block" isn't sensible.
			 */
			c->to_user_error = total_processed = -ENOTBLK;
			mutex_unlock(&t->lock);
			goto cleanup_unlock;
		}

		/*
		 * Ensures that accesses to the message data are not ordered
		 * before the remote accesses that produce that message data.
		 *
		 * This pairs with the barrier in user_map(), via the
		 * conditional within the while loop above.  Also see the lack
		 * of barrier in user_dtr(), which is why this can be after the
		 * destroyed check.
		 */
		smp_rmb();

		c->cur_to_user = msg_get_to_user(t);
		WARN_ON(c->cur_to_user == NULL);
		mutex_unlock(&t->lock);
	}

	processed = msg_copy_to_iov(c->cur_to_user, to);
	total_processed += processed;

	WARN_ON(c->cur_to_user->posn_to_user > c->cur_to_user->total_to_user);
	if (c->cur_to_user->posn_to_user == c->cur_to_user->total_to_user) {
		struct message *m = c->cur_to_user;

		c->cur_to_user = NULL;
		list_add_tail(&m->from_user, &c->from_user);
	}

cleanup_unlock:
	mutex_unlock(&c->lock);
	return total_processed;
}
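/*
 * write_iter handler.  A response arrives as a struct dm_user_message header
 * (buffered in the channel's scratch message until complete), which is then
 * matched by sequence number against a message this channel previously handed
 * out; any remaining bytes are the READ payload.  Once the whole response has
 * arrived, the BIO is completed.
 */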
static ssize_t dev_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct channel *c = channel_from_file(iocb->ki_filp);
	ssize_t total_processed = 0;
	ssize_t processed;

	mutex_lock(&c->lock);

	if (unlikely(c->from_user_error)) {
		total_processed = c->from_user_error;
		goto cleanup_unlock;
	}

	/*
	 * cur_from_user can never be NULL.  If there's no real message it must
	 * point to the scratch space.
	 */
	WARN_ON(c->cur_from_user == NULL);
	if (c->cur_from_user->posn_from_user < sizeof(struct dm_user_message)) {
		struct message *msg, *old;

		processed = msg_copy_from_iov(c->cur_from_user, from);
		if (processed <= 0) {
			pr_warn("msg_copy_from_iov() returned %zd\n",
				processed);
			c->from_user_error = -EINVAL;
			goto cleanup_unlock;
		}
		total_processed += processed;

		/*
		 * In the unlikely event the user has provided us a very short
		 * write, not even big enough to fill a message, just succeed.
		 * We'll eventually build up enough bytes to do something.
		 */
		if (unlikely(c->cur_from_user->posn_from_user <
			     sizeof(struct dm_user_message)))
			goto cleanup_unlock;

		old = c->cur_from_user;
		mutex_lock(&c->target->lock);
		msg = msg_get_from_user(c, c->cur_from_user->msg.seq);
		if (msg == NULL) {
			pr_info("user provided an invalid message seq of %llx\n",
				old->msg.seq);
			mutex_unlock(&c->target->lock);
			c->from_user_error = -EINVAL;
			goto cleanup_unlock;
		}
		mutex_unlock(&c->target->lock);

		WARN_ON(old->posn_from_user != sizeof(struct dm_user_message));
		msg->posn_from_user = sizeof(struct dm_user_message);
		msg->return_type = old->msg.type;
		msg->return_flags = old->msg.flags;
		WARN_ON(msg->posn_from_user > msg->total_from_user);
		c->cur_from_user = msg;
		WARN_ON(old != &c->scratch_message_from_user);
	}

	/*
	 * Userspace can signal an error for single requests by overwriting the
	 * type field.
	 */
	switch (c->cur_from_user->return_type) {
	case DM_USER_RESP_SUCCESS:
		c->cur_from_user->bio->bi_status = BLK_STS_OK;
		break;
	case DM_USER_RESP_ERROR:
	case DM_USER_RESP_UNSUPPORTED:
	default:
		c->cur_from_user->bio->bi_status = BLK_STS_IOERR;
		goto finish_bio;
	}

	/*
	 * The op was a success as far as userspace is concerned, so process
	 * whatever data may come along with it.  The user may provide the BIO
	 * data in multiple chunks, in which case we don't need to finish the
	 * BIO.
	 */
	processed = msg_copy_from_iov(c->cur_from_user, from);
	total_processed += processed;

	if (c->cur_from_user->posn_from_user <
	    c->cur_from_user->total_from_user)
		goto cleanup_unlock;

finish_bio:
	/*
	 * When we set up this message the BIO's size matched the
	 * message size, if that's not still the case then something
	 * has gone off the rails.
	 */
	WARN_ON(bio_size(c->cur_from_user->bio) != 0);
	bio_endio(c->cur_from_user->bio);

	/*
	 * We don't actually need to take the target lock here, as all
	 * we're doing is freeing the message and mempools have their
	 * own lock.  Each channel has its own scratch message.
	 */
	WARN_ON(c->cur_from_user == &c->scratch_message_from_user);
	mempool_free(c->cur_from_user, &c->target->message_pool);
	c->scratch_message_from_user.posn_from_user = 0;
	c->cur_from_user = &c->scratch_message_from_user;

cleanup_unlock:
	mutex_unlock(&c->lock);
	return total_processed;
}
static int dev_release(struct inode *inode, struct file *file)
{
	struct channel *c;

	c = channel_from_file(file);
	mutex_lock(&c->lock);
	channel_free(c);

	return 0;
}

static const struct file_operations file_operations = {
	.owner = THIS_MODULE,
	.open = dev_open,
	.llseek = no_llseek,
	.read_iter = dev_read,
	.write_iter = dev_write,
	.release = dev_release,
};
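/*
 * Constructor for a "user" target.  The table line passes three target
 * arguments; only argv[2] is consumed here, as the name of the control node
 * created under /dev/dm-user/.  The first two arguments are not examined in
 * this file.
 */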
static int user_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct target *t;
	int r;

	if (argc != 3) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto cleanup_none;
	}

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (t == NULL) {
		r = -ENOMEM;
		goto cleanup_none;
	}
	ti->private = t;

	/* Enable more BIO types. */
	ti->num_discard_bios = 1;
	ti->discards_supported = true;
	ti->num_flush_bios = 1;
	ti->flush_supported = true;

	/*
	 * We begin with a single reference to the target, which is miscdev's
	 * reference.  This ensures that the target won't be freed
	 * until after the miscdev has been unregistered and all extant
	 * channels have been closed.
	 */
	kref_init(&t->references);

	t->daemon_terminated = false;
	mutex_init(&t->lock);
	init_waitqueue_head(&t->wq);
	INIT_LIST_HEAD(&t->to_user);
	mempool_init_kmalloc_pool(&t->message_pool, MAX_OUTSTANDING_MESSAGES,
				  sizeof(struct message));

	t->miscdev.minor = MISC_DYNAMIC_MINOR;
	t->miscdev.fops = &file_operations;
	t->miscdev.name = kasprintf(GFP_KERNEL, "dm-user/%s", argv[2]);
	if (t->miscdev.name == NULL) {
		r = -ENOMEM;
		goto cleanup_message_pool;
	}

	/*
	 * Once the miscdev is registered it can be opened, and therefore
	 * concurrent references to the channel can happen.  Holding the target
	 * lock during misc_register() could deadlock.  If registration
	 * succeeds then we will not access the target again so we just stick a
	 * barrier here, which pairs with taking the target lock everywhere
	 * else the target is accessed.
	 *
	 * I forgot where we ended up on the RCpc/RCsc locks.  IIUC, RCsc locks
	 * would mean that we could take the target lock earlier and release it
	 * here instead of the memory barrier.  I'm not sure that's any better,
	 * though, and this isn't on a hot path so it probably doesn't matter
	 * either way.
	 */
	smp_mb();

	r = misc_register(&t->miscdev);
	if (r) {
		DMERR("Unable to register miscdev %s for dm-user",
		      t->miscdev.name);
		r = -ENOMEM;
		goto cleanup_misc_name;
	}

	return 0;

cleanup_misc_name:
	kfree(t->miscdev.name);
cleanup_message_pool:
	mempool_exit(&t->message_pool);
	kfree(t);
cleanup_none:
	return r;
}
static void user_dtr(struct dm_target *ti)
{
	struct target *t = target_from_target(ti);

	/*
	 * Removes the miscdev.  This must be called without the target lock
	 * held to avoid a possible deadlock because our open implementation is
	 * called holding the miscdev lock and must later take the target lock.
	 *
	 * There is no race here because only DM can register/unregister the
	 * miscdev, and DM ensures that doesn't happen twice.  The internal
	 * miscdev lock is sufficient to ensure there are no races between
	 * deregistering the miscdev and open.
	 */
	misc_deregister(&t->miscdev);

	/*
	 * We are now free to take the target's lock and drop our reference to
	 * the target.  There are almost certainly tasks sleeping in read on at
	 * least one of the channels associated with this target, this
	 * explicitly wakes them up and terminates the read.
	 */
	mutex_lock(&t->lock);

	/*
	 * No barrier here, as wait/wake ensures that the flag visibility is
	 * correct WRT the wake/sleep state of the target tasks.
	 */
	t->dm_destroyed = true;
	wake_up_all(&t->wq);
	target_put(t);
}
/*
 * Consumes a BIO from device mapper, queueing it up for userspace.
 */
static int user_map(struct dm_target *ti, struct bio *bio)
{
	struct target *t;
	struct message *entry;

	t = target_from_target(ti);

	/*
	 * FIXME
	 *
	 * This seems like a bad idea.  Specifically, here we're
	 * directly on the IO path when we take the target lock, which may also
	 * be taken from a user context.  The user context doesn't actively
	 * trigger anything that may sleep while holding the lock, but this
	 * still seems like a bad idea.
	 *
	 * The obvious way to fix this would be to use a proper queue, which
	 * would result in no shared locks between the direct IO path and user
	 * tasks.  I had a version that did this, but the head-of-line blocking
	 * from the circular buffer resulted in us needing a fairly large
	 * allocation in order to avoid situations in which the queue fills up
	 * and everything goes off the rails.
	 *
	 * I could jump through some hoops to avoid a shared lock while still
	 * allowing for a large queue, but I'm not actually sure that allowing
	 * for very large queues is the right thing to do here.  Intuitively it
	 * seems better to keep the queues small in here (essentially sized to
	 * the user latency for performance reasons only) and rely on returning
	 * DM_MAPIO_REQUEUE regularly, as that would give the rest of the
	 * kernel more information.
	 *
	 * I'll spend some time trying to figure out what's going on with
	 * DM_MAPIO_REQUEUE, but if someone has a better idea of how to fix
	 * this I'm all ears.
	 */
	mutex_lock(&t->lock);

	/*
	 * FIXME
	 *
	 * The assumption here is that there's no benefit to returning
	 * DM_MAPIO_KILL as opposed to just erroring out the BIO, but I'm not
	 * sure that's actually true -- for example, I could imagine users
	 * expecting that submitted BIOs are unlikely to fail and therefore
	 * relying on submission failure to indicate an unsupported type.
	 *
	 * There are two ways I can think of to fix this:
	 *   - Add DM arguments that are parsed during the constructor that
	 *     allow various dm_target flags to be set that indicate the op
	 *     types supported by this target.  This may make sense for things
	 *     like discard, where DM can already transform the BIOs to a form
	 *     that's likely to be supported.
	 *   - Some sort of pre-filter that allows userspace to hook in here
	 *     and kill BIOs before marking them as submitted.  My guess would
	 *     be that a userspace round trip is a bad idea here, but a BPF
	 *     call seems reasonable.
	 *
	 * My guess is that we'd likely want to do both.  The first one is easy
	 * and gives DM the proper info, so it seems better.  The BPF call
	 * seems overly complex for just this, but one could imagine wanting to
	 * sometimes return _MAPPED and a BPF filter would be the way to do
	 * that.
	 *
	 * For example, in Android we have an in-kernel DM device called
	 * "dm-bow" that takes advantage of some portion of the space that has
	 * been discarded on a device to provide opportunistic block-level
	 * backups.  While one could imagine just implementing this entirely in
	 * userspace, that would come with an appreciable performance penalty.
	 * Instead one could keep a BPF program that forwards most accesses
	 * directly to the backing block device while informing a userspace
	 * daemon of any discarded space and on writes to blocks that are to be
	 * backed up.
	 */
	if (unlikely((bio_type_to_user_type(bio) < 0) ||
		     (bio_flags_to_user_flags(bio) < 0))) {
		mutex_unlock(&t->lock);
		return DM_MAPIO_KILL;
	}

	entry = msg_get_map(t);
	if (unlikely(entry == NULL)) {
		mutex_unlock(&t->lock);
		return DM_MAPIO_REQUEUE;
	}

	entry->msg.type = bio_type_to_user_type(bio);
	entry->msg.flags = bio_flags_to_user_flags(bio);
	entry->msg.sector = bio->bi_iter.bi_sector;
	entry->msg.len = bio_size(bio);
	entry->bio = bio;
	entry->posn_to_user = 0;
	entry->total_to_user = bio_bytes_needed_to_user(bio);
	entry->posn_from_user = 0;
	entry->total_from_user = bio_bytes_needed_from_user(bio);
	entry->delayed = false;
	entry->t = t;

	/* Pairs with the barrier in dev_read() */
	smp_wmb();

	list_add_tail(&entry->to_user, &t->to_user);

	/*
	 * If there is no daemon to process the IOs, queue these messages into
	 * a workqueue with a timeout.
	 */
	if (!is_user_space_thread_present(t))
		enqueue_delayed_work(entry, !t->daemon_terminated);

	wake_up_interruptible(&t->wq);
	mutex_unlock(&t->lock);
	return DM_MAPIO_SUBMITTED;
}
static struct target_type user_target = {
	.name = "user",
	.version = { 1, 0, 0 },
	.module = THIS_MODULE,
	.ctr = user_ctr,
	.dtr = user_dtr,
	.map = user_map,
};

static int __init dm_user_init(void)
{
	int r;

	r = dm_register_target(&user_target);
	if (r) {
		DMERR("register failed %d", r);
		goto error;
	}

	return 0;

error:
	return r;
}

static void __exit dm_user_exit(void)
{
	dm_unregister_target(&user_target);
}

module_init(dm_user_init);
module_exit(dm_user_exit);
MODULE_AUTHOR("Palmer Dabbelt <[email protected]>");
MODULE_DESCRIPTION(DM_NAME " target returning blocks from userspace");
MODULE_LICENSE("GPL");