multipath.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2018 Christoph Hellwig.
 */

#include <linux/backing-dev.h>
#include <linux/moduleparam.h>
#include <linux/vmalloc.h>
#include <trace/events/block.h>
#include "nvme.h"

bool multipath = true;
module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= "numa",
	[NVME_IOPOLICY_RR]	= "round-robin",
};

static int iopolicy = NVME_IOPOLICY_NUMA;

static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
{
	if (!val)
		return -EINVAL;
	if (!strncmp(val, "numa", 4))
		iopolicy = NVME_IOPOLICY_NUMA;
	else if (!strncmp(val, "round-robin", 11))
		iopolicy = NVME_IOPOLICY_RR;
	else
		return -EINVAL;
	return 0;
}

static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
{
	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
}

module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
	&iopolicy, 0644);
MODULE_PARM_DESC(iopolicy,
	"Default multipath I/O policy; 'numa' (default) or 'round-robin'");

void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
	subsys->iopolicy = iopolicy;
}

void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_unfreeze_queue(h->disk->queue);
}

void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_freeze_queue_wait(h->disk->queue);
}

void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_freeze_queue_start(h->disk->queue);
}

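/*
 * Fail over a request that could not be completed on this path: clear the
 * cached path, park the request's bios on the ns_head requeue list, and let
 * the requeue work resubmit them on another available path.
 */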
void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status & 0x7ff;
	unsigned long flags;
	struct bio *bio;

	nvme_mpath_clear_current_path(ns);

	/*
	 * If we got back an ANA error, we know the controller is alive but not
	 * ready to serve this namespace.  Kick off a re-read of the ANA
	 * information page, and just try any other available path for now.
	 */
	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
		queue_work(nvme_wq, &ns->ctrl->ana_work);
	}

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	for (bio = req->bio; bio; bio = bio->bi_next) {
		bio_set_dev(bio, ns->head->disk->part0);
		if (bio->bi_opf & REQ_POLLED) {
			bio->bi_opf &= ~REQ_POLLED;
			bio->bi_cookie = BLK_QC_T_NONE;
		}
	}
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	blk_mq_end_request(req, 0);
	kblockd_schedule_work(&ns->head->requeue_work);
}

void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (!ns->head->disk)
			continue;
		kblockd_schedule_work(&ns->head->requeue_work);
		if (ctrl->state == NVME_CTRL_LIVE)
			disk_uevent(ns->head->disk, KOBJ_CHANGE);
	}
	up_read(&ctrl->namespaces_rwsem);
}

static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};

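/*
 * Drop any cached per-node current_path pointers that reference this
 * namespace so the next submission picks a new path.  Returns true if at
 * least one node was using this namespace as its current path.
 */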
bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	bool changed = false;
	int node;

	if (!head)
		goto out;

	for_each_node(node) {
		if (ns == rcu_access_pointer(head->current_path[node])) {
			rcu_assign_pointer(head->current_path[node], NULL);
			changed = true;
		}
	}
out:
	return changed;
}

void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		nvme_mpath_clear_current_path(ns);
		kblockd_schedule_work(&ns->head->requeue_work);
	}
	up_read(&ctrl->namespaces_rwsem);
}

void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	sector_t capacity = get_capacity(head->disk);
	int node;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&head->srcu);
	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (capacity != get_capacity(ns->disk))
			clear_bit(NVME_NS_READY, &ns->flags);
	}
	srcu_read_unlock(&head->srcu, srcu_idx);

	for_each_node(node)
		rcu_assign_pointer(head->current_path[node], NULL);
	kblockd_schedule_work(&head->requeue_work);
}

static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
	/*
	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
	 * still be able to complete assuming that the controller is connected.
	 * Otherwise it will fail immediately and return to the requeue list.
	 */
	if (ns->ctrl->state != NVME_CTRL_LIVE &&
	    ns->ctrl->state != NVME_CTRL_DELETING)
		return true;
	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
	    !test_bit(NVME_NS_READY, &ns->flags))
		return true;
	return false;
}

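/*
 * Slow path: walk all sibling namespaces and pick the best usable one for
 * @node, preferring ANA-optimized paths and, under the "numa" policy, the
 * path with the smallest NUMA distance.  The winner is cached in
 * head->current_path[node].
 */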
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
	struct nvme_ns *found = NULL, *fallback = NULL, *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
			distance = node_distance(node, ns->ctrl->numa_node);
		else
			distance = LOCAL_DISTANCE;

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (distance < found_distance) {
				found_distance = distance;
				found = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (distance < fallback_distance) {
				fallback_distance = distance;
				fallback = ns;
			}
			break;
		default:
			break;
		}
	}

	if (!found)
		found = fallback;
	if (found)
		rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
		struct nvme_ns *ns)
{
	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
			siblings);
	if (ns)
		return ns;
	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}

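/*
 * Round-robin selection: start the search at the sibling after the previous
 * path (@old) and take the first usable optimized path, falling back to a
 * non-optimized one, or to @old itself if nothing better is found.
 */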
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
		int node, struct nvme_ns *old)
{
	struct nvme_ns *ns, *found = NULL;

	if (list_is_singular(&head->list)) {
		if (nvme_path_is_disabled(old))
			return NULL;
		return old;
	}

	for (ns = nvme_next_ns(head, old);
	     ns && ns != old;
	     ns = nvme_next_ns(head, ns)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
			found = ns;
			goto out;
		}
		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
			found = ns;
	}

	/*
	 * The loop above skips the current path for round-robin semantics.
	 * Fall back to the current path if either:
	 * - no other optimized path found and current is optimized,
	 * - no other usable path found and current is usable.
	 */
	if (!nvme_path_is_disabled(old) &&
	    (old->ana_state == NVME_ANA_OPTIMIZED ||
	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
		return old;

	if (!found)
		return NULL;
out:
	rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
	return ns->ctrl->state == NVME_CTRL_LIVE &&
		ns->ana_state == NVME_ANA_OPTIMIZED;
}

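/*
 * Fast path: return the cached path for the local NUMA node if it is still
 * optimized, otherwise fall back to a full search (or to round-robin
 * selection when that policy is active).  Must be called under
 * srcu_read_lock(&head->srcu).
 */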
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
	int node = numa_node_id();
	struct nvme_ns *ns;

	ns = srcu_dereference(head->current_path[node], &head->srcu);
	if (unlikely(!ns))
		return __nvme_find_path(head, node);

	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
		return nvme_round_robin_path(head, node, ns);
	if (unlikely(!nvme_path_is_optimized(ns)))
		return __nvme_find_path(head, node);
	return ns;
}

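/*
 * Return true if at least one controller whose fast_io_fail timeout has not
 * expired is live or still (re)connecting, i.e. I/O should be requeued
 * rather than failed.
 */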
static bool nvme_available_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
			continue;
		switch (ns->ctrl->state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			/* fallthru */
			return true;
		default:
			break;
		}
	}
	return false;
}

static void nvme_ns_head_submit_bio(struct bio *bio)
{
	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	int srcu_idx;

	/*
	 * The namespace might be going away and the bio might be moved to a
	 * different queue via blk_steal_bios(), so we need to use the bio_split
	 * pool from the original queue to allocate the bvecs from.
	 */
	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		bio_set_dev(bio, ns->disk->part0);
		bio->bi_opf |= REQ_NVME_MPATH;
		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
				      bio->bi_iter.bi_sector);
		submit_bio_noacct(bio);
	} else if (nvme_available_path(head)) {
		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");

		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no available path - failing I/O\n");

		bio_io_error(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
}

static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
{
	if (!nvme_tryget_ns_head(bdev->bd_disk->private_data))
		return -ENXIO;
	return 0;
}

static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
{
	nvme_put_ns_head(disk->private_data);
}

#ifdef CONFIG_BLK_DEV_ZONED
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (ns)
		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}
#else
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */

const struct block_device_operations nvme_ns_head_ops = {
	.owner		= THIS_MODULE,
	.submit_bio	= nvme_ns_head_submit_bio,
	.open		= nvme_ns_head_open,
	.release	= nvme_ns_head_release,
	.ioctl		= nvme_ns_head_ioctl,
	.compat_ioctl	= blkdev_compat_ptr_ioctl,
	.getgeo		= nvme_getgeo,
	.report_zones	= nvme_ns_head_report_zones,
	.pr_ops		= &nvme_pr_ops,
};

static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
{
	return container_of(cdev, struct nvme_ns_head, cdev);
}

static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
{
	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
		return -ENXIO;
	return 0;
}

static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
{
	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
	return 0;
}

static const struct file_operations nvme_ns_head_chr_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_ns_head_chr_open,
	.release	= nvme_ns_head_chr_release,
	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
	.uring_cmd_iopoll = nvme_ns_head_chr_uring_cmd_iopoll,
};

static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
{
	int ret;

	head->cdev_device.parent = &head->subsys->dev;
	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
			   head->subsys->instance, head->instance);
	if (ret)
		return ret;
	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
			    &nvme_ns_head_chr_fops, THIS_MODULE);
	return ret;
}

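/*
 * Resubmit bios that were parked on the requeue list; they go through
 * nvme_ns_head_submit_bio() again and pick a (possibly different) path.
 */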
static void nvme_requeue_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, requeue_work);
	struct bio *bio, *next;

	spin_lock_irq(&head->requeue_lock);
	next = bio_list_get(&head->requeue_list);
	spin_unlock_irq(&head->requeue_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_next;
		bio->bi_next = NULL;

		submit_bio_noacct(bio);
	}
}

int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	bool vwc = false;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);

	/*
	 * Add a multipath node if the subsystem supports multiple controllers.
	 * We also do this for private namespaces as the namespace sharing flag
	 * could change after a rescan.
	 */
	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
	    !nvme_is_unique_nsid(ctrl, head) || !multipath)
		return 0;

	head->disk = blk_alloc_disk(ctrl->numa_node);
	if (!head->disk)
		return -ENOMEM;
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);

	blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
	/*
	 * This assumes all controllers that refer to a namespace either
	 * support poll queues or not.  That is not a strict guarantee,
	 * but if the assumption is wrong the effect is only suboptimal
	 * performance, not a correctness problem.
	 */
	if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL &&
	    ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
		blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);

	/* set to a default value of 512 until the disk is validated */
	blk_queue_logical_block_size(head->disk->queue, 512);
	blk_set_stacking_limits(&head->disk->queue->limits);
	blk_queue_dma_alignment(head->disk->queue, 3);

	/* we need to propagate up the VWC (volatile write cache) setting */
	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
		vwc = true;
	blk_queue_write_cache(head->disk->queue, vwc, vwc);
	return 0;
}

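/*
 * Called when a path becomes usable: register the ns_head disk and char
 * device on the first live path, and prime the per-node path cache if this
 * path is ANA-optimized.
 */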
static void nvme_mpath_set_live(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	int rc;

	if (!head->disk)
		return;

	/*
	 * test_and_set_bit() is used because it is protecting against two nvme
	 * paths simultaneously calling device_add_disk() on the same namespace
	 * head.
	 */
	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		rc = device_add_disk(&head->subsys->dev, head->disk,
				     nvme_ns_id_attr_groups);
		if (rc) {
			clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
			return;
		}
		nvme_add_ns_head_cdev(head);
	}

	mutex_lock(&head->lock);
	if (nvme_path_is_optimized(ns)) {
		int node, srcu_idx;

		srcu_idx = srcu_read_lock(&head->srcu);
		for_each_node(node)
			__nvme_find_path(head, node);
		srcu_read_unlock(&head->srcu, srcu_idx);
	}
	mutex_unlock(&head->lock);

	synchronize_srcu(&head->srcu);
	kblockd_schedule_work(&head->requeue_work);
}

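/*
 * Walk the ANA log page descriptor by descriptor, sanity check each group
 * descriptor against the advertised log size and limits, and invoke @cb on
 * it.  Must be called with ctrl->ana_lock held.
 */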
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
			  void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids;
		size_t nsid_buf_size;

		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;

		nr_nsids = le32_to_cpu(desc->nnsids);
		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
	}
	return 0;
}

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
	/*
	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
	 * and in turn to this path device.  However we cannot accept this I/O
	 * if the controller is not live.  This may deadlock if called from
	 * nvme_mpath_init_identify() and the ctrl will never complete
	 * initialization, preventing I/O from completing.  For this case we
	 * will reprocess the ANA log page in nvme_mpath_update() once the
	 * controller is ready.
	 */
	if (nvme_state_is_live(ns->ana_state) &&
	    ns->ctrl->state == NVME_CTRL_LIVE)
		nvme_mpath_set_live(ns);
}

static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		unsigned nsid;
again:
		nsid = le32_to_cpu(desc->nsids[n]);
		if (ns->head->ns_id < nsid)
			continue;
		if (ns->head->ns_id == nsid)
			nvme_update_ns_ana_state(desc, ns);
		if (++n == nr_nsids)
			break;
		if (ns->head->ns_id > nsid)
			goto again;
	}
	up_read(&ctrl->namespaces_rwsem);
	return 0;
}

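/*
 * Fetch the ANA log page from the controller, apply it to all namespaces,
 * and arm the ANATT timer while any group is still reporting the "change"
 * state.
 */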
static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;
	int error;

	mutex_lock(&ctrl->ana_lock);
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
	if (error) {
		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
		goto out_unlock;
	}

	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
			nvme_update_ana_state);
	if (error)
		goto out_unlock;

	/*
	 * In theory we should have an ANATT timer per group as they might enter
	 * the change state at different times.  But that is a lot of overhead
	 * just to protect against a target that keeps entering new change
	 * states while never finishing previous ones.  We'll still eventually
	 * time out once all groups are in the change state, so this isn't a
	 * big deal.
	 *
	 * We also double the ANATT value to provide some slack for transports
	 * or AEN processing overhead.
	 */
	if (nr_change_groups)
		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
	else
		del_timer_sync(&ctrl->anatt_timer);
out_unlock:
	mutex_unlock(&ctrl->ana_lock);
	return error;
}

static void nvme_ana_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

	if (ctrl->state != NVME_CTRL_LIVE)
		return;

	nvme_read_ana_log(ctrl);
}

void nvme_mpath_update(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;

	if (!ctrl->ana_log_buf)
		return;

	mutex_lock(&ctrl->ana_lock);
	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
	mutex_unlock(&ctrl->ana_lock);
}

static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
	if (!nvme_ctrl_use_ana(ctrl))
		return;
	del_timer_sync(&ctrl->anatt_timer);
	cancel_work_sync(&ctrl->ana_work);
}

#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
	struct device_attribute subsys_attr_##_name =	\
		__ATTR(_name, _mode, _show, _store)

static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);

	return sysfs_emit(buf, "%s\n",
			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}

static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);
	int i;

	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
			WRITE_ONCE(subsys->iopolicy, i);
			return count;
		}
	}

	return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	struct nvme_ana_group_desc *dst = data;

	if (desc->grpid != dst->grpid)
		return 0;

	*dst = *desc;
	return -ENXIO; /* just break out of the loop */
}

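/*
 * Set up the ANA state for a newly scanned namespace: look up its group
 * descriptor in the cached log page, or schedule ana_work to re-read the
 * log if the descriptor is not there yet.  Without ANA the path is simply
 * treated as optimized.
 */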
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		struct nvme_ana_group_desc desc = {
			.grpid = anagrpid,
			.state = 0,
		};

		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(anagrpid);
		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
		mutex_unlock(&ns->ctrl->ana_lock);
		if (desc.state) {
			/* found the group desc: update */
			nvme_update_ns_ana_state(&desc, ns);
		} else {
			/* group desc not found: trigger a re-read */
			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
			queue_work(nvme_wq, &ns->ctrl->ana_work);
		}
	} else {
		ns->ana_state = NVME_ANA_OPTIMIZED;
		nvme_mpath_set_live(ns);
	}

	if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
				   ns->head->disk->queue);
#ifdef CONFIG_BLK_DEV_ZONED
	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
		ns->head->disk->nr_zones = ns->disk->nr_zones;
#endif
}

void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	kblockd_schedule_work(&head->requeue_work);
	if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		nvme_cdev_del(&head->cdev, &head->cdev_device);
		del_gendisk(head->disk);
	}
}

void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	put_disk(head->disk);
}

void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
{
	mutex_init(&ctrl->ana_lock);
	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
}

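/*
 * Parse the ANA-related fields from the Identify Controller data, size and
 * (re)allocate the ANA log buffer, and do an initial read of the log page.
 * Returns 0 with ANA left disabled if the controller does not support ANA
 * or the log page would exceed the maximum transfer size.
 */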
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
	size_t ana_log_size;
	int error = 0;

	/* check if multipath is enabled and we have the capability */
	if (!multipath || !ctrl->subsys ||
	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
		return 0;

	if (!ctrl->max_namespaces ||
	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
		dev_err(ctrl->device,
			"Invalid MNAN value %u\n", ctrl->max_namespaces);
		return -EINVAL;
	}

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
		ctrl->max_namespaces * sizeof(__le32);
	if (ana_log_size > max_transfer_size) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%zd).\n",
			ana_log_size, max_transfer_size);
		dev_err(ctrl->device, "disabling ANA support.\n");
		goto out_uninit;
	}
	if (ana_log_size > ctrl->ana_log_size) {
		nvme_mpath_stop(ctrl);
		nvme_mpath_uninit(ctrl);
		ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
		if (!ctrl->ana_log_buf)
			return -ENOMEM;
	}
	ctrl->ana_log_size = ana_log_size;
	error = nvme_read_ana_log(ctrl);
	if (error)
		goto out_uninit;
	return 0;

out_uninit:
	nvme_mpath_uninit(ctrl);
	return error;
}

void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
	kvfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
	ctrl->ana_log_size = 0;
}