// SPDX-License-Identifier: GPL-2.0
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>

#include "blk.h"

#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
        ZONE_COND_NAME(NOT_WP),
        ZONE_COND_NAME(EMPTY),
        ZONE_COND_NAME(IMP_OPEN),
        ZONE_COND_NAME(EXP_OPEN),
        ZONE_COND_NAME(CLOSED),
        ZONE_COND_NAME(READONLY),
        ZONE_COND_NAME(FULL),
        ZONE_COND_NAME(OFFLINE),
};
#undef ZONE_COND_NAME

/**
 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
 * @zone_cond: BLK_ZONE_COND_XXX.
 *
 * Description: Centralized block layer function to convert BLK_ZONE_COND_XXX
 * into string format. Useful for debugging and tracing zone conditions. For
 * an invalid BLK_ZONE_COND_XXX it returns the string "UNKNOWN".
 */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
{
        static const char *zone_cond_str = "UNKNOWN";

        if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
                zone_cond_str = zone_cond_name[zone_cond];

        return zone_cond_str;
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
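
/*
 * Illustrative sketch, not part of the original file: a debugging helper
 * that uses blk_zone_cond_str() to print a zone's condition. The function
 * name and the use of pr_debug() are assumptions for the example only.
 */
static inline void example_dump_zone_cond(const struct blk_zone *zone)
{
        pr_debug("zone at sector %llu: condition %s\n",
                 zone->start, blk_zone_cond_str(zone->cond));
}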

/*
 * Return true if a request is a write request that needs zone write locking.
 */
bool blk_req_needs_zone_write_lock(struct request *rq)
{
        if (blk_rq_is_passthrough(rq))
                return false;

        if (!rq->q->disk->seq_zones_wlock)
                return false;

        if (bdev_op_is_zoned_write(rq->q->disk->part0, req_op(rq)))
                return blk_rq_zone_is_seq(rq);

        return false;
}
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);

bool blk_req_zone_write_trylock(struct request *rq)
{
        unsigned int zno = blk_rq_zone_no(rq);

        if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock))
                return false;

        WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
        rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;

        return true;
}
EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock);

void __blk_req_zone_write_lock(struct request *rq)
{
        if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
                                          rq->q->disk->seq_zones_wlock)))
                return;

        WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
        rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);

void __blk_req_zone_write_unlock(struct request *rq)
{
        rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
        if (rq->q->disk->seq_zones_wlock)
                WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
                                                 rq->q->disk->seq_zones_wlock));
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
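
/*
 * Illustrative sketch, not from this file: how a dispatch path can combine
 * the helpers above, loosely modeled on the mq-deadline pattern. The
 * function name is an assumption; only functions defined above are used.
 */
static bool example_try_take_zone_write_lock(struct request *rq)
{
        /* Only sequential zone writes on zoned disks need the lock. */
        if (!blk_req_needs_zone_write_lock(rq))
                return true;

        /* Grab the per-zone write lock, or ask the caller to defer. */
        return blk_req_zone_write_trylock(rq);
}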

/**
 * bdev_nr_zones - Get number of zones
 * @bdev: Target device
 *
 * Return the total number of zones of a zoned block device. For a block
 * device without zone capabilities, the number of zones is always 0.
 */
unsigned int bdev_nr_zones(struct block_device *bdev)
{
        sector_t zone_sectors = bdev_zone_sectors(bdev);

        if (!bdev_is_zoned(bdev))
                return 0;
        return (bdev_nr_sectors(bdev) + zone_sectors - 1) >>
                ilog2(zone_sectors);
}
EXPORT_SYMBOL_GPL(bdev_nr_zones);
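
/*
 * Usage sketch, not part of this file: sizing a per-zone array from the
 * zone count. kvcalloc() is a real kernel allocator; the function name and
 * element size parameter are assumptions for the example.
 */
static void *example_alloc_per_zone_array(struct block_device *bdev,
                                          size_t elem_size)
{
        unsigned int nr_zones = bdev_nr_zones(bdev);

        if (!nr_zones)
                return NULL;    /* not a zoned device */

        return kvcalloc(nr_zones, elem_size, GFP_KERNEL);
}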

/**
 * blkdev_report_zones - Get zones information
 * @bdev: Target block device
 * @sector: Sector from which to report zones
 * @nr_zones: Maximum number of zones to report
 * @cb: Callback function called for each reported zone
 * @data: Private data for the callback
 *
 * Description:
 *    Get zone information starting from the zone containing @sector for at
 *    most @nr_zones, and call @cb for each zone reported by the device.
 *    To report all zones in a device starting from @sector, the BLK_ALL_ZONES
 *    constant can be passed to @nr_zones.
 *    Returns the number of zones reported by the device, or a negative errno
 *    value in case of failure.
 *
 *    Note: The caller must use memalloc_noXX_save/restore() calls to control
 *    memory allocations done within this function.
 */
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
                        unsigned int nr_zones, report_zones_cb cb, void *data)
{
        struct gendisk *disk = bdev->bd_disk;
        sector_t capacity = get_capacity(disk);

        if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
                return -EOPNOTSUPP;

        if (!nr_zones || sector >= capacity)
                return 0;

        return disk->fops->report_zones(disk, sector, nr_zones, cb, data);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
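
/*
 * Usage sketch, not part of this file: counting the sequential write
 * required zones of a device with blkdev_report_zones(). Both function
 * names are assumptions for the example.
 */
static int example_count_seq_zones_cb(struct blk_zone *zone, unsigned int idx,
                                      void *data)
{
        unsigned int *nr_seq = data;

        if (zone->type == BLK_ZONE_TYPE_SEQWRITE_REQ)
                (*nr_seq)++;
        return 0;
}

static int example_count_seq_zones(struct block_device *bdev)
{
        unsigned int nr_seq = 0;
        int ret;

        ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
                                  example_count_seq_zones_cb, &nr_seq);
        return ret < 0 ? ret : nr_seq;
}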

static inline unsigned long *blk_alloc_zone_bitmap(int node,
                                                   unsigned int nr_zones)
{
        return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
                            GFP_NOIO, node);
}

static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
                                  void *data)
{
        /*
         * For an all-zones reset, ignore conventional, empty, read-only
         * and offline zones.
         */
        switch (zone->cond) {
        case BLK_ZONE_COND_NOT_WP:
        case BLK_ZONE_COND_EMPTY:
        case BLK_ZONE_COND_READONLY:
        case BLK_ZONE_COND_OFFLINE:
                return 0;
        default:
                set_bit(idx, (unsigned long *)data);
                return 0;
        }
}

static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
                                          gfp_t gfp_mask)
{
        struct gendisk *disk = bdev->bd_disk;
        sector_t capacity = bdev_nr_sectors(bdev);
        sector_t zone_sectors = bdev_zone_sectors(bdev);
        unsigned long *need_reset;
        struct bio *bio = NULL;
        sector_t sector = 0;
        int ret;

        need_reset = blk_alloc_zone_bitmap(disk->queue->node, disk->nr_zones);
        if (!need_reset)
                return -ENOMEM;

        ret = disk->fops->report_zones(disk, 0, disk->nr_zones,
                                       blk_zone_need_reset_cb, need_reset);
        if (ret < 0)
                goto out_free_need_reset;

        ret = 0;
        while (sector < capacity) {
                if (!test_bit(disk_zone_no(disk, sector), need_reset)) {
                        sector += zone_sectors;
                        continue;
                }

                bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC,
                                   gfp_mask);
                bio->bi_iter.bi_sector = sector;
                sector += zone_sectors;

                /* This may take a while, so be nice to others */
                cond_resched();
        }

        if (bio) {
                ret = submit_bio_wait(bio);
                bio_put(bio);
        }

out_free_need_reset:
        kfree(need_reset);
        return ret;
}

static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
{
        struct bio bio;

        bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
        return submit_bio_wait(&bio);
}

/**
 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
 * @bdev: Target block device
 * @op: Operation to be performed on the zones
 * @sector: Start sector of the first zone to operate on
 * @nr_sectors: Number of sectors, should be at least the length of one zone
 *		and must be zone size aligned.
 * @gfp_mask: Memory allocation flags (for bio_alloc)
 *
 * Description:
 *    Perform the specified operation on the range of zones specified by
 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 *    is valid, but the specified range should not contain conventional zones.
 *    The operation to execute on each zone can be a zone reset, open, close
 *    or finish request.
 */
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
                     sector_t sector, sector_t nr_sectors, gfp_t gfp_mask)
{
        struct request_queue *q = bdev_get_queue(bdev);
        sector_t zone_sectors = bdev_zone_sectors(bdev);
        sector_t capacity = bdev_nr_sectors(bdev);
        sector_t end_sector = sector + nr_sectors;
        struct bio *bio = NULL;
        int ret = 0;

        if (!bdev_is_zoned(bdev))
                return -EOPNOTSUPP;

        if (bdev_read_only(bdev))
                return -EPERM;

        if (!op_is_zone_mgmt(op))
                return -EOPNOTSUPP;

        if (end_sector <= sector || end_sector > capacity)
                /* Out of range */
                return -EINVAL;

        /* Check alignment (handle eventual smaller last zone) */
        if (sector & (zone_sectors - 1))
                return -EINVAL;

        if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity)
                return -EINVAL;

        /*
         * In the case of a zone reset operation over all zones,
         * REQ_OP_ZONE_RESET_ALL can be used with devices supporting this
         * command. For other devices, we emulate this command behavior by
         * identifying the zones needing a reset.
         */
        if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) {
                if (!blk_queue_zone_resetall(q))
                        return blkdev_zone_reset_all_emulated(bdev, gfp_mask);
                return blkdev_zone_reset_all(bdev, gfp_mask);
        }

        while (sector < end_sector) {
                bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask);
                bio->bi_iter.bi_sector = sector;
                sector += zone_sectors;

                /* This may take a while, so be nice to others */
                cond_resched();
        }

        ret = submit_bio_wait(bio);
        bio_put(bio);

        return ret;
}
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
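
/*
 * Usage sketch, not part of this file: resetting the single zone containing
 * @sector, as an in-kernel caller such as a zoned file system might do. The
 * wrapper name is an assumption; zone sizes are a power of two, so masking
 * aligns @sector down to the zone start.
 */
static int example_reset_one_zone(struct block_device *bdev, sector_t sector,
                                  gfp_t gfp_mask)
{
        sector_t zone_sectors = bdev_zone_sectors(bdev);

        sector &= ~(zone_sectors - 1);
        return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, sector, zone_sectors,
                                gfp_mask);
}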

struct zone_report_args {
        struct blk_zone __user *zones;
};

static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
                                    void *data)
{
        struct zone_report_args *args = data;

        if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
                return -EFAULT;
        return 0;
}

/*
 * BLKREPORTZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
                              unsigned int cmd, unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        struct zone_report_args args;
        struct request_queue *q;
        struct blk_zone_report rep;
        int ret;

        if (!argp)
                return -EINVAL;

        q = bdev_get_queue(bdev);
        if (!q)
                return -ENXIO;

        if (!bdev_is_zoned(bdev))
                return -ENOTTY;

        if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
                return -EFAULT;

        if (!rep.nr_zones)
                return -EINVAL;

        args.zones = argp + sizeof(struct blk_zone_report);

        ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
                                  blkdev_copy_zone_to_user, &args);
        if (ret < 0)
                return ret;

        rep.nr_zones = ret;
        rep.flags = BLK_ZONE_REP_CAPACITY;
        if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
                return -EFAULT;
        return 0;
}
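
/*
 * Userspace view of the BLKREPORTZONE layout handled above, shown here as a
 * hedged illustration only: the ioctl argument is a struct blk_zone_report
 * immediately followed by room for rep.nr_zones struct blk_zone entries.
 *
 *	struct blk_zone_report *rep;
 *
 *	rep = malloc(sizeof(*rep) + nr * sizeof(struct blk_zone));
 *	rep->sector = 0;
 *	rep->nr_zones = nr;
 *	if (!ioctl(fd, BLKREPORTZONE, rep))
 *		printf("%u zones reported\n", rep->nr_zones);
 */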

static int blkdev_truncate_zone_range(struct block_device *bdev, fmode_t mode,
                                      const struct blk_zone_range *zrange)
{
        loff_t start, end;

        if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
            zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
                /* Out of range */
                return -EINVAL;

        start = zrange->sector << SECTOR_SHIFT;
        end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

        return truncate_bdev_range(bdev, mode, start, end);
}

/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
                           unsigned int cmd, unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        struct request_queue *q;
        struct blk_zone_range zrange;
        enum req_op op;
        int ret;

        if (!argp)
                return -EINVAL;

        q = bdev_get_queue(bdev);
        if (!q)
                return -ENXIO;

        if (!bdev_is_zoned(bdev))
                return -ENOTTY;

        if (!(mode & FMODE_WRITE))
                return -EBADF;

        if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
                return -EFAULT;

        switch (cmd) {
        case BLKRESETZONE:
                op = REQ_OP_ZONE_RESET;

                /* Invalidate the page cache, including dirty pages. */
                filemap_invalidate_lock(bdev->bd_inode->i_mapping);
                ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
                if (ret)
                        goto fail;
                break;
        case BLKOPENZONE:
                op = REQ_OP_ZONE_OPEN;
                break;
        case BLKCLOSEZONE:
                op = REQ_OP_ZONE_CLOSE;
                break;
        case BLKFINISHZONE:
                op = REQ_OP_ZONE_FINISH;
                break;
        default:
                return -ENOTTY;
        }

        ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors,
                               GFP_KERNEL);

fail:
        if (cmd == BLKRESETZONE)
                filemap_invalidate_unlock(bdev->bd_inode->i_mapping);

        return ret;
}
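
/*
 * Userspace view of the zone management ioctls handled above, shown here as
 * a hedged illustration only: BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and
 * BLKFINISHZONE all take a struct blk_zone_range describing the target
 * zones, e.g. to reset one zone:
 *
 *	struct blk_zone_range range = {
 *		.sector = zone_start_sector,
 *		.nr_sectors = zone_len_sectors,
 *	};
 *
 *	if (ioctl(fd, BLKRESETZONE, &range))
 *		perror("BLKRESETZONE");
 */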

void disk_free_zone_bitmaps(struct gendisk *disk)
{
        kfree(disk->conv_zones_bitmap);
        disk->conv_zones_bitmap = NULL;
        kfree(disk->seq_zones_wlock);
        disk->seq_zones_wlock = NULL;
}

struct blk_revalidate_zone_args {
        struct gendisk  *disk;
        unsigned long   *conv_zones_bitmap;
        unsigned long   *seq_zones_wlock;
        unsigned int    nr_zones;
        sector_t        zone_sectors;
        sector_t        sector;
};

/*
 * Helper function to check the validity of zones of a zoned block device.
 */
static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
                                  void *data)
{
        struct blk_revalidate_zone_args *args = data;
        struct gendisk *disk = args->disk;
        struct request_queue *q = disk->queue;
        sector_t capacity = get_capacity(disk);

        /*
         * All zones must have the same size, with the possible exception of a
         * smaller last zone.
         */
        if (zone->start == 0) {
                if (zone->len == 0 || !is_power_of_2(zone->len)) {
                        pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n",
                                disk->disk_name, zone->len);
                        return -ENODEV;
                }

                args->zone_sectors = zone->len;
                args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len);
        } else if (zone->start + args->zone_sectors < capacity) {
                if (zone->len != args->zone_sectors) {
                        pr_warn("%s: Invalid zoned device with non constant zone size\n",
                                disk->disk_name);
                        return -ENODEV;
                }
        } else {
                if (zone->len > args->zone_sectors) {
                        pr_warn("%s: Invalid zoned device with larger last zone size\n",
                                disk->disk_name);
                        return -ENODEV;
                }
        }

        /* Check for holes in the zone report */
        if (zone->start != args->sector) {
                pr_warn("%s: Zone gap at sectors %llu..%llu\n",
                        disk->disk_name, args->sector, zone->start);
                return -ENODEV;
        }

        /* Check zone type */
        switch (zone->type) {
        case BLK_ZONE_TYPE_CONVENTIONAL:
                if (!args->conv_zones_bitmap) {
                        args->conv_zones_bitmap =
                                blk_alloc_zone_bitmap(q->node, args->nr_zones);
                        if (!args->conv_zones_bitmap)
                                return -ENOMEM;
                }
                set_bit(idx, args->conv_zones_bitmap);
                break;
        case BLK_ZONE_TYPE_SEQWRITE_REQ:
        case BLK_ZONE_TYPE_SEQWRITE_PREF:
                if (!args->seq_zones_wlock) {
                        args->seq_zones_wlock =
                                blk_alloc_zone_bitmap(q->node, args->nr_zones);
                        if (!args->seq_zones_wlock)
                                return -ENOMEM;
                }
                break;
        default:
                pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
                        disk->disk_name, (int)zone->type, zone->start);
                return -ENODEV;
        }

        args->sector += zone->len;
        return 0;
}

/**
 * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
 * @disk: Target disk
 * @update_driver_data: Callback to update driver data on the frozen disk
 *
 * Helper function for low-level device drivers to (re)allocate and initialize
 * a disk request queue's zone bitmaps. This function should normally be called
 * within the disk ->revalidate method for blk-mq based drivers. For BIO based
 * drivers, only disk->nr_zones needs to be updated so that the sysfs exposed
 * value is correct.
 * If the @update_driver_data callback function is not NULL, the callback is
 * executed with the device request queue frozen after all zones have been
 * checked.
 */
int blk_revalidate_disk_zones(struct gendisk *disk,
                              void (*update_driver_data)(struct gendisk *disk))
{
        struct request_queue *q = disk->queue;
        struct blk_revalidate_zone_args args = {
                .disk           = disk,
        };
        unsigned int noio_flag;
        int ret;

        if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
                return -EIO;
        if (WARN_ON_ONCE(!queue_is_mq(q)))
                return -EIO;

        if (!get_capacity(disk))
                return -EIO;

        /*
         * Ensure that all memory allocations in this context are done as if
         * GFP_NOIO was specified.
         */
        noio_flag = memalloc_noio_save();
        ret = disk->fops->report_zones(disk, 0, UINT_MAX,
                                       blk_revalidate_zone_cb, &args);
        if (!ret) {
                pr_warn("%s: No zones reported\n", disk->disk_name);
                ret = -ENODEV;
        }
        memalloc_noio_restore(noio_flag);

        /*
         * If zones were reported, make sure that the entire disk capacity
         * has been checked.
         */
        if (ret > 0 && args.sector != get_capacity(disk)) {
                pr_warn("%s: Missing zones from sector %llu\n",
                        disk->disk_name, args.sector);
                ret = -ENODEV;
        }

        /*
         * Install the new bitmaps and update nr_zones only once the queue is
         * stopped and all I/Os are completed (i.e. a scheduler is not
         * referencing the bitmaps).
         */
        blk_mq_freeze_queue(q);
        if (ret > 0) {
                blk_queue_chunk_sectors(q, args.zone_sectors);
                disk->nr_zones = args.nr_zones;
                swap(disk->seq_zones_wlock, args.seq_zones_wlock);
                swap(disk->conv_zones_bitmap, args.conv_zones_bitmap);
                if (update_driver_data)
                        update_driver_data(disk);
                ret = 0;
        } else {
                pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
                disk_free_zone_bitmaps(disk);
        }
        blk_mq_unfreeze_queue(q);

        kfree(args.seq_zones_wlock);
        kfree(args.conv_zones_bitmap);
        return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
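
/*
 * Illustrative sketch, not part of this file: how a blk-mq based zoned
 * driver might call blk_revalidate_disk_zones() after (re)scanning its
 * zone configuration. The callback and function names are assumptions.
 */
static void example_update_driver_data(struct gendisk *disk)
{
        /* Runs with the queue frozen, after all zones have been checked. */
}

static int example_driver_revalidate_zones(struct gendisk *disk)
{
        return blk_revalidate_disk_zones(disk, example_update_driver_data);
}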

void disk_clear_zone_settings(struct gendisk *disk)
{
        struct request_queue *q = disk->queue;

        blk_mq_freeze_queue(q);

        disk_free_zone_bitmaps(disk);
        blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q);
        q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE;
        disk->nr_zones = 0;
        disk->max_open_zones = 0;
        disk->max_active_zones = 0;
        q->limits.chunk_sectors = 0;
        q->limits.zone_write_granularity = 0;
        q->limits.max_zone_append_sectors = 0;

        blk_mq_unfreeze_queue(q);
}