dm-zone.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2021 Western Digital Corporation or its affiliates.
 */

#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>

#include "dm-core.h"

#define DM_MSG_PREFIX "zone"

#define DM_ZONE_INVALID_WP_OFST	UINT_MAX

/*
 * For internal zone reports bypassing the top BIO submission path.
 */
static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
				  sector_t sector, unsigned int nr_zones,
				  report_zones_cb cb, void *data)
{
	struct gendisk *disk = md->disk;
	int ret;
	struct dm_report_zones_args args = {
		.next_sector = sector,
		.orig_data = data,
		.orig_cb = cb,
	};

	do {
		struct dm_target *tgt;

		tgt = dm_table_find_target(t, args.next_sector);
		if (WARN_ON_ONCE(!tgt->type->report_zones))
			return -EIO;

		args.tgt = tgt;
		ret = tgt->type->report_zones(tgt, &args,
					      nr_zones - args.zone_idx);
		if (ret < 0)
			return ret;
	} while (args.zone_idx < nr_zones &&
		 args.next_sector < get_capacity(disk));

	return args.zone_idx;
}

/*
 * User facing dm device block device report zone operation. This calls the
 * report_zones operation for each target of a device table. This operation is
 * generally implemented by targets using dm_report_zones().
 */
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct mapped_device *md = disk->private_data;
	struct dm_table *map;
	int srcu_idx, ret;

	if (dm_suspended_md(md))
		return -EAGAIN;

	map = dm_get_live_table(md, &srcu_idx);
	if (!map)
		return -EIO;

	ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data);

	dm_put_live_table(md, srcu_idx);

	return ret;
}

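/*
 * Zone report callback used by dm_report_zones(): remap the reported zone
 * start sector and write pointer from the underlying device to the target
 * range of the mapped device before forwarding the zone to the original
 * report callback.
 */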
static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx,
			      void *data)
{
	struct dm_report_zones_args *args = data;
	sector_t sector_diff = args->tgt->begin - args->start;

	/*
	 * Ignore zones beyond the target range.
	 */
	if (zone->start >= args->start + args->tgt->len)
		return 0;

	/*
	 * Remap the start sector and write pointer position of the zone
	 * to match its position in the target range.
	 */
	zone->start += sector_diff;
	if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
		if (zone->cond == BLK_ZONE_COND_FULL)
			zone->wp = zone->start + zone->len;
		else if (zone->cond == BLK_ZONE_COND_EMPTY)
			zone->wp = zone->start;
		else
			zone->wp += sector_diff;
	}

	args->next_sector = zone->start + zone->len;
	return args->orig_cb(zone, args->zone_idx++, args->orig_data);
}

/*
 * Helper for drivers of zoned targets to implement struct target_type
 * report_zones operation.
 */
int dm_report_zones(struct block_device *bdev, sector_t start, sector_t sector,
		    struct dm_report_zones_args *args, unsigned int nr_zones)
{
	/*
	 * Set the target mapping start sector first so that
	 * dm_report_zones_cb() can correctly remap zone information.
	 */
	args->start = start;

	return blkdev_report_zones(bdev, sector, nr_zones,
				   dm_report_zones_cb, args);
}
EXPORT_SYMBOL_GPL(dm_report_zones);

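/*
 * Check if a BIO is a write (or write zeroes) carrying data and directed at
 * a zoned mapped device. Empty flushes and all other operations return false.
 */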
bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
{
	struct request_queue *q = md->queue;

	if (!blk_queue_is_zoned(q))
		return false;

	switch (bio_op(bio)) {
	case REQ_OP_WRITE_ZEROES:
	case REQ_OP_WRITE:
		return !op_is_flush(bio->bi_opf) && bio_sectors(bio);
	default:
		return false;
	}
}

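/*
 * Free the zone resources of a mapped device: the conventional zone bitmap,
 * the sequential zone write lock bitmap and the zone write pointer offset
 * array used for zone append emulation.
 */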
void dm_cleanup_zoned_dev(struct mapped_device *md)
{
	if (md->disk) {
		kfree(md->disk->conv_zones_bitmap);
		md->disk->conv_zones_bitmap = NULL;
		kfree(md->disk->seq_zones_wlock);
		md->disk->seq_zones_wlock = NULL;
	}

	kvfree(md->zwp_offset);
	md->zwp_offset = NULL;
	md->nr_zones = 0;
}

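/*
 * Return the write pointer offset of a zone, relative to the zone start,
 * in number of sectors.
 */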
static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_CLOSED:
		return zone->wp - zone->start;
	case BLK_ZONE_COND_FULL:
		return zone->len;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
	default:
		/*
		 * Conventional, offline and read-only zones do not have a
		 * valid write pointer. Use 0 as for an empty zone.
		 */
		return 0;
	}
}

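/*
 * Zone report callback used by dm_revalidate_zones(): allocate on demand the
 * conventional zone bitmap, the sequential zone write lock bitmap and the
 * zone write pointer offset array, and record the type and write pointer
 * position of each reported zone.
 */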
static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx,
				 void *data)
{
	struct mapped_device *md = data;
	struct gendisk *disk = md->disk;

	switch (zone->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
		if (!disk->conv_zones_bitmap) {
			disk->conv_zones_bitmap =
				kcalloc(BITS_TO_LONGS(disk->nr_zones),
					sizeof(unsigned long), GFP_NOIO);
			if (!disk->conv_zones_bitmap)
				return -ENOMEM;
		}
		set_bit(idx, disk->conv_zones_bitmap);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
		if (!disk->seq_zones_wlock) {
			disk->seq_zones_wlock =
				kcalloc(BITS_TO_LONGS(disk->nr_zones),
					sizeof(unsigned long), GFP_NOIO);
			if (!disk->seq_zones_wlock)
				return -ENOMEM;
		}
		if (!md->zwp_offset) {
			md->zwp_offset =
				kvcalloc(disk->nr_zones, sizeof(unsigned int),
					 GFP_KERNEL);
			if (!md->zwp_offset)
				return -ENOMEM;
		}
		md->zwp_offset[idx] = dm_get_zone_wp_offset(zone);
		break;
	default:
		DMERR("Invalid zone type 0x%x at sectors %llu",
		      (int)zone->type, zone->start);
		return -ENODEV;
	}

	return 0;
}

/*
 * Revalidate the zones of a mapped device to initialize the resources
 * necessary for zone append emulation. Note that we cannot simply use the
 * block layer blk_revalidate_disk_zones() function here as the mapped device
 * is suspended (this is called from __bind() context).
 */
static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t)
{
	struct gendisk *disk = md->disk;
	unsigned int noio_flag;
	int ret;

	/*
	 * Check if something changed. If yes, cleanup the current resources
	 * and reallocate everything.
	 */
	if (!disk->nr_zones || disk->nr_zones != md->nr_zones)
		dm_cleanup_zoned_dev(md);

	if (md->nr_zones)
		return 0;

	/*
	 * Scan all zones to initialize everything. Ensure that all vmalloc
	 * operations in this context are done as if GFP_NOIO was specified.
	 */
	noio_flag = memalloc_noio_save();
	ret = dm_blk_do_report_zones(md, t, 0, disk->nr_zones,
				     dm_zone_revalidate_cb, md);
	memalloc_noio_restore(noio_flag);
	if (ret < 0)
		goto err;
	if (ret != disk->nr_zones) {
		ret = -EIO;
		goto err;
	}

	md->nr_zones = disk->nr_zones;

	return 0;

err:
	DMERR("Revalidate zones failed %d", ret);
	dm_cleanup_zoned_dev(md);
	return ret;
}

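/*
 * Check if all the targets of a table use devices that natively support zone
 * append: a target that requests zone append emulation, that cannot iterate
 * its devices, or that uses a device that is not zoned, disables native zone
 * append support for the whole table.
 */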
static int device_not_zone_append_capable(struct dm_target *ti,
					  struct dm_dev *dev, sector_t start,
					  sector_t len, void *data)
{
	return !bdev_is_zoned(dev->bdev);
}

static bool dm_table_supports_zone_append(struct dm_table *t)
{
	for (unsigned int i = 0; i < t->num_targets; i++) {
		struct dm_target *ti = dm_table_get_target(t, i);

		if (ti->emulate_zone_append)
			return false;

		if (!ti->type->iterate_devices ||
		    ti->type->iterate_devices(ti, device_not_zone_append_capable, NULL))
			return false;
	}

	return true;
}

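/*
 * Set the zone related restrictions of a mapped device: update the number of
 * zones exposed in sysfs and, if the table does not natively support zone
 * append, mark the device as needing zone append emulation and initialize
 * the emulation resources once the device capacity is known.
 */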
int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
{
	struct mapped_device *md = t->md;

	/*
	 * For a zoned target, the number of zones should be updated for the
	 * correct value to be exposed in sysfs queue/nr_zones.
	 */
	WARN_ON_ONCE(queue_is_mq(q));
	md->disk->nr_zones = bdev_nr_zones(md->disk->part0);

	/* Check if zone append is natively supported */
	if (dm_table_supports_zone_append(t)) {
		clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
		dm_cleanup_zoned_dev(md);
		return 0;
	}

	/*
	 * Mark the mapped device as needing zone append emulation and
	 * initialize the emulation resources once the capacity is set.
	 */
	set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
	if (!get_capacity(md->disk))
		return 0;

	return dm_revalidate_zones(md, t);
}

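/*
 * Refresh the write pointer offset of a single zone by issuing a zone report
 * for that zone, used to recover a zone left in an unknown state.
 */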
static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
				       void *data)
{
	unsigned int *wp_offset = data;

	*wp_offset = dm_get_zone_wp_offset(zone);

	return 0;
}

static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno,
				    unsigned int *wp_ofst)
{
	sector_t sector = zno * bdev_zone_sectors(md->disk->part0);
	unsigned int noio_flag;
	struct dm_table *t;
	int srcu_idx, ret;

	t = dm_get_live_table(md, &srcu_idx);
	if (!t)
		return -EIO;

	/*
	 * Ensure that all memory allocations in this context are done as if
	 * GFP_NOIO was specified.
	 */
	noio_flag = memalloc_noio_save();
	ret = dm_blk_do_report_zones(md, t, sector, 1,
				     dm_update_zone_wp_offset_cb, wp_ofst);
	memalloc_noio_restore(noio_flag);

	dm_put_live_table(md, srcu_idx);

	if (ret != 1)
		return -EIO;

	return 0;
}

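/*
 * Details of the original BIO, saved before mapping so that the write pointer
 * offset of the target zone can be updated correctly after the clone BIO has
 * possibly been remapped and submitted.
 */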
struct orig_bio_details {
	enum req_op op;
	unsigned int nr_sectors;
};

/*
 * First phase of BIO mapping for targets with zone append emulation:
 * check all BIOs that change a zone write pointer and change zone
 * append operations into regular write operations.
 */
static bool dm_zone_map_bio_begin(struct mapped_device *md,
				  unsigned int zno, struct bio *clone)
{
	sector_t zsectors = bdev_zone_sectors(md->disk->part0);
	unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);

	/*
	 * If the target zone is in an error state, recover by inspecting the
	 * zone to get its current write pointer position. Note that since the
	 * target zone is already locked, a BIO issuing context should never
	 * see the zone write in the DM_ZONE_UPDATING_WP_OFST state.
	 */
	if (zwp_offset == DM_ZONE_INVALID_WP_OFST) {
		if (dm_update_zone_wp_offset(md, zno, &zwp_offset))
			return false;
		WRITE_ONCE(md->zwp_offset[zno], zwp_offset);
	}

	switch (bio_op(clone)) {
	case REQ_OP_ZONE_RESET:
	case REQ_OP_ZONE_FINISH:
		return true;
	case REQ_OP_WRITE_ZEROES:
	case REQ_OP_WRITE:
		/* Writes must be aligned to the zone write pointer */
		if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset)
			return false;
		break;
	case REQ_OP_ZONE_APPEND:
		/*
		 * Change zone append operations into non-mergeable regular
		 * writes directed at the current write pointer position of the
		 * target zone.
		 */
		clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE |
			(clone->bi_opf & (~REQ_OP_MASK));
		clone->bi_iter.bi_sector += zwp_offset;
		break;
	default:
		DMWARN_LIMIT("Invalid BIO operation");
		return false;
	}

	/* Cannot write to a full zone */
	if (zwp_offset >= zsectors)
		return false;

	return true;
}

/*
 * Second phase of BIO mapping for targets with zone append emulation:
 * update the zone write pointer offset array to account for the additional
 * data written to a zone. Note that at this point, the remapped clone BIO
 * may already have completed, so we do not touch it.
 */
static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int zno,
					struct orig_bio_details *orig_bio_details,
					unsigned int nr_sectors)
{
	unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);

	/* The clone BIO may already have been completed and failed */
	if (zwp_offset == DM_ZONE_INVALID_WP_OFST)
		return BLK_STS_IOERR;

	/* Update the zone wp offset */
	switch (orig_bio_details->op) {
	case REQ_OP_ZONE_RESET:
		WRITE_ONCE(md->zwp_offset[zno], 0);
		return BLK_STS_OK;
	case REQ_OP_ZONE_FINISH:
		WRITE_ONCE(md->zwp_offset[zno],
			   bdev_zone_sectors(md->disk->part0));
		return BLK_STS_OK;
	case REQ_OP_WRITE_ZEROES:
	case REQ_OP_WRITE:
		WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
		return BLK_STS_OK;
	case REQ_OP_ZONE_APPEND:
		/*
		 * Check that the target did not truncate the write operation
		 * emulating a zone append.
		 */
		if (nr_sectors != orig_bio_details->nr_sectors) {
			DMWARN_LIMIT("Truncated write for zone append");
			return BLK_STS_IOERR;
		}
		WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
		return BLK_STS_OK;
	default:
		DMWARN_LIMIT("Invalid BIO operation");
		return BLK_STS_IOERR;
	}
}

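/*
 * Acquire the write lock of the zone targeted by a clone BIO, sleeping until
 * the lock is available, and flag the clone as owning the lock.
 */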
static inline void dm_zone_lock(struct gendisk *disk, unsigned int zno,
				struct bio *clone)
{
	if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)))
		return;

	wait_on_bit_lock_io(disk->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE);
	bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED);
}

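/*
 * Release the zone write lock owned by a clone BIO, if any, and wake up any
 * waiter.
 */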
static inline void dm_zone_unlock(struct gendisk *disk, unsigned int zno,
				  struct bio *clone)
{
	if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
		return;

	WARN_ON_ONCE(!test_bit(zno, disk->seq_zones_wlock));
	clear_bit_unlock(zno, disk->seq_zones_wlock);
	smp_mb__after_atomic();
	wake_up_bit(disk->seq_zones_wlock, zno);

	bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED);
}

static bool dm_need_zone_wp_tracking(struct bio *bio)
{
	/*
	 * Special processing is not needed for operations that do not need the
	 * zone write lock, that is, all operations that target conventional
	 * zones and all operations that do not directly modify a sequential
	 * zone write pointer.
	 */
	if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
		return false;

	switch (bio_op(bio)) {
	case REQ_OP_WRITE_ZEROES:
	case REQ_OP_WRITE:
	case REQ_OP_ZONE_RESET:
	case REQ_OP_ZONE_FINISH:
	case REQ_OP_ZONE_APPEND:
		return bio_zone_is_seq(bio);
	default:
		return false;
	}
}

/*
 * Special IO mapping for targets needing zone append emulation.
 */
int dm_zone_map_bio(struct dm_target_io *tio)
{
	struct dm_io *io = tio->io;
	struct dm_target *ti = tio->ti;
	struct mapped_device *md = io->md;
	struct bio *clone = &tio->clone;
	struct orig_bio_details orig_bio_details;
	unsigned int zno;
	blk_status_t sts;
	int r;

	/*
	 * IOs that do not change a zone write pointer do not need
	 * any additional special processing.
	 */
	if (!dm_need_zone_wp_tracking(clone))
		return ti->type->map(ti, clone);

	/* Lock the target zone */
	zno = bio_zone_no(clone);
	dm_zone_lock(md->disk, zno, clone);

	orig_bio_details.nr_sectors = bio_sectors(clone);
	orig_bio_details.op = bio_op(clone);

	/*
	 * Check that the bio and the target zone write pointer offset are
	 * both valid, and if the bio is a zone append, remap it to a write.
	 */
	if (!dm_zone_map_bio_begin(md, zno, clone)) {
		dm_zone_unlock(md->disk, zno, clone);
		return DM_MAPIO_KILL;
	}

	/* Let the target do its work */
	r = ti->type->map(ti, clone);
	switch (r) {
	case DM_MAPIO_SUBMITTED:
		/*
		 * The target submitted the clone BIO. The target zone will
		 * be unlocked on completion of the clone.
		 */
		sts = dm_zone_map_bio_end(md, zno, &orig_bio_details,
					  *tio->len_ptr);
		break;
	case DM_MAPIO_REMAPPED:
		/*
		 * The target only remapped the clone BIO. In case of error,
		 * unlock the target zone here as the clone will not be
		 * submitted.
		 */
		sts = dm_zone_map_bio_end(md, zno, &orig_bio_details,
					  *tio->len_ptr);
		if (sts != BLK_STS_OK)
			dm_zone_unlock(md->disk, zno, clone);
		break;
	case DM_MAPIO_REQUEUE:
	case DM_MAPIO_KILL:
	default:
		dm_zone_unlock(md->disk, zno, clone);
		sts = BLK_STS_IOERR;
		break;
	}

	if (sts != BLK_STS_OK)
		return DM_MAPIO_KILL;

	return r;
}

/*
 * IO completion callback called from clone_endio().
 */
void dm_zone_endio(struct dm_io *io, struct bio *clone)
{
	struct mapped_device *md = io->md;
	struct gendisk *disk = md->disk;
	struct bio *orig_bio = io->orig_bio;
	unsigned int zwp_offset;
	unsigned int zno;

	/*
	 * For targets that do not emulate zone append, we only need to
	 * handle native zone-append bios.
	 */
	if (!dm_emulate_zone_append(md)) {
		/*
		 * Get the offset within the zone of the written sector
		 * and add that to the original bio sector position.
		 */
		if (clone->bi_status == BLK_STS_OK &&
		    bio_op(clone) == REQ_OP_ZONE_APPEND) {
			sector_t mask =
				(sector_t)bdev_zone_sectors(disk->part0) - 1;

			orig_bio->bi_iter.bi_sector +=
				clone->bi_iter.bi_sector & mask;
		}

		return;
	}

	/*
	 * For targets that do emulate zone append, if the clone BIO does not
	 * own the target zone write lock, we have nothing to do.
	 */
	if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
		return;

	zno = bio_zone_no(orig_bio);
	if (clone->bi_status != BLK_STS_OK) {
		/*
		 * BIOs that modify a zone write pointer may leave the zone
		 * in an unknown state in case of failure (e.g. the write
		 * pointer was only partially advanced). In this case, set
		 * the target zone write pointer as invalid unless it is
		 * already being updated.
		 */
		WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST);
	} else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
		/*
		 * Get the written sector for zone append operations that were
		 * emulated using regular write operations.
		 */
		zwp_offset = READ_ONCE(md->zwp_offset[zno]);
		if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio)))
			WRITE_ONCE(md->zwp_offset[zno],
				   DM_ZONE_INVALID_WP_OFST);
		else
			orig_bio->bi_iter.bi_sector +=
				zwp_offset - bio_sectors(orig_bio);
	}

	dm_zone_unlock(disk, zno, clone);
}