// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 2001 Andrea Arcangeli <[email protected]> SuSE
 * Copyright (C) 2016 - 2020 Christoph Hellwig
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/cleancache.h>
#include <linux/part_stat.h>
#include <linux/uaccess.h>
#include <linux/stat.h>
#include "../fs/internal.h"
#include "blk.h"

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);

static void bdev_write_inode(struct block_device *bdev)
{
	struct inode *inode = bdev->bd_inode;
	int ret;

	spin_lock(&inode->i_lock);
	while (inode->i_state & I_DIRTY) {
		spin_unlock(&inode->i_lock);
		ret = write_inode_now(inode, true);
		if (ret)
			pr_warn_ratelimited(
				"VFS: Dirty inode writeback failed for block device %pg (err=%d).\n",
				bdev, ret);
		spin_lock(&inode->i_lock);
	}
	spin_unlock(&inode->i_lock);
}

/* Kill _all_ buffers and pagecache, dirty or not. */
static void kill_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping_empty(mapping))
		return;

	invalidate_bh_lrus();
	truncate_inode_pages(mapping, 0);
}

/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages) {
		invalidate_bh_lrus();
		lru_add_drain_all();	/* make sure all lru add caches are flushed */
		invalidate_mapping_pages(mapping, 0, -1);
	}
	/*
	 * 99% of the time, we don't need to flush the cleancache on the bdev.
	 * But, for the strange corners, let's be cautious.
	 */
	cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(invalidate_bdev);

/*
 * Drop all buffers & page cache for the given bdev range. This function bails
 * out with an error if bdev has another exclusive owner (such as a filesystem).
 */
int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
			loff_t lstart, loff_t lend)
{
	/*
	 * If we don't hold an exclusive handle for the device, upgrade to it
	 * while we discard the buffer cache to avoid discarding buffers
	 * under a live filesystem.
	 */
	if (!(mode & FMODE_EXCL)) {
		int err = bd_prepare_to_claim(bdev, truncate_bdev_range);
		if (err)
			goto invalidate;
	}

	truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
	if (!(mode & FMODE_EXCL))
		bd_abort_claiming(bdev, truncate_bdev_range);
	return 0;

invalidate:
	/*
	 * Someone else has the handle exclusively open. Try invalidating
	 * instead. The 'end' argument is inclusive so the rounding is safe.
	 */
	return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
					     lstart >> PAGE_SHIFT,
					     lend >> PAGE_SHIFT);
}

static void set_init_blocksize(struct block_device *bdev)
{
	unsigned int bsize = bdev_logical_block_size(bdev);
	loff_t size = i_size_read(bdev->bd_inode);

	while (bsize < PAGE_SIZE) {
		if (size & bsize)
			break;
		bsize <<= 1;
	}
	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}

int set_blocksize(struct block_device *bdev, int size)
{
	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (size < bdev_logical_block_size(bdev))
		return -EINVAL;

	/* Don't change the size if it is the same as the current size */
	if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
		sync_blockdev(bdev);
		bdev->bd_inode->i_blkbits = blksize_bits(size);
		kill_bdev(bdev);
	}
	return 0;
}
EXPORT_SYMBOL(set_blocksize);

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (set_blocksize(sb->s_bdev, size))
		return 0;
	/*
	 * If we get here, we know size is a power of two and its value is
	 * between 512 and PAGE_SIZE.
	 */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}
EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_logical_block_size(sb->s_bdev);

	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}
EXPORT_SYMBOL(sb_min_blocksize);
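
/*
 * Example (not part of the upstream file): a minimal sketch of how a
 * filesystem's mount path might use the helpers above.  The function name
 * and the on-disk block size parameter "disk_bsize" are hypothetical.
 */
static int example_setup_blocksize(struct super_block *sb, int disk_bsize)
{
	/* Start at the smallest block size the device can address. */
	if (!sb_min_blocksize(sb, 512))
		return -EINVAL;

	/* Then switch to the filesystem's own, possibly larger, block size. */
	if (disk_bsize > sb->s_blocksize && !sb_set_blocksize(sb, disk_bsize))
		return -EINVAL;

	return 0;
}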

int sync_blockdev_nowait(struct block_device *bdev)
{
	if (!bdev)
		return 0;
	return filemap_flush(bdev->bd_inode->i_mapping);
}
EXPORT_SYMBOL_GPL(sync_blockdev_nowait);

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	if (!bdev)
		return 0;
	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
}
EXPORT_SYMBOL(sync_blockdev);

int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
{
	return filemap_write_and_wait_range(bdev->bd_inode->i_mapping,
			lstart, lend);
}
EXPORT_SYMBOL(sync_blockdev_range);

/*
 * Write out and wait upon all dirty data associated with this
 * device.  Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);

	if (sb) {
		int res = sync_filesystem(sb);
		drop_super(sb);
		return res;
	}
	return sync_blockdev(bdev);
}
EXPORT_SYMBOL(fsync_bdev);

/**
 * freeze_bdev -- lock a filesystem and force it into a consistent state
 * @bdev: blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can actually unfreeze the frozen filesystem when multiple
 * freeze requests arrive simultaneously.  It counts up in freeze_bdev() and
 * down in thaw_bdev().  When it becomes 0, thaw_bdev() actually unfreezes
 * the filesystem.
 */
int freeze_bdev(struct block_device *bdev)
{
	struct super_block *sb;
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (++bdev->bd_fsfreeze_count > 1)
		goto done;

	sb = get_active_super(bdev);
	if (!sb)
		goto sync;
	if (sb->s_op->freeze_super)
		error = sb->s_op->freeze_super(sb);
	else
		error = freeze_super(sb);
	deactivate_super(sb);

	if (error) {
		bdev->bd_fsfreeze_count--;
		goto done;
	}
	bdev->bd_fsfreeze_sb = sb;

sync:
	sync_blockdev(bdev);
done:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(freeze_bdev);

/**
 * thaw_bdev -- unlock filesystem
 * @bdev: blockdevice to unlock
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 */
int thaw_bdev(struct block_device *bdev)
{
	struct super_block *sb;
	int error = -EINVAL;

	mutex_lock(&bdev->bd_fsfreeze_mutex);
	if (!bdev->bd_fsfreeze_count)
		goto out;

	error = 0;
	if (--bdev->bd_fsfreeze_count > 0)
		goto out;

	sb = bdev->bd_fsfreeze_sb;
	if (!sb)
		goto out;

	if (sb->s_op->thaw_super)
		error = sb->s_op->thaw_super(sb);
	else
		error = thaw_super(sb);
	if (error)
		bdev->bd_fsfreeze_count++;
	else
		bdev->bd_fsfreeze_sb = NULL;
out:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(thaw_bdev);
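
/*
 * Example (not part of the upstream file): a minimal sketch of pairing
 * freeze_bdev() with thaw_bdev(), e.g. around taking a snapshot of the
 * device.  The function name and the snapshot step are hypothetical.
 */
static int example_snapshot_frozen(struct block_device *bdev)
{
	int error = freeze_bdev(bdev);	/* nests via bd_fsfreeze_count */

	if (error)
		return error;

	/* ... take the snapshot while the filesystem is consistent ... */

	return thaw_bdev(bdev);		/* the last thaw actually unfreezes */
}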

/**
 * bdev_read_page() - Start reading a page from a block device
 * @bdev: The device to read the page from
 * @sector: The offset on the device to read the page to (need not be aligned)
 * @page: The page to read
 *
 * On entry, the page should be locked.  It will be unlocked when the page
 * has been read.  If the block driver implements rw_page synchronously,
 * that will be true on exit from this function, but it need not be.
 *
 * Errors returned by this function are usually "soft", e.g. out of memory, or
 * queue full; callers should try a different route to read this page rather
 * than propagate an error back up the stack.
 *
 * Return: negative errno if an error occurs, 0 if submission was successful.
 */
int bdev_read_page(struct block_device *bdev, sector_t sector,
		struct page *page)
{
	const struct block_device_operations *ops = bdev->bd_disk->fops;
	int result = -EOPNOTSUPP;

	if (!ops->rw_page || bdev_get_integrity(bdev))
		return result;

	result = blk_queue_enter(bdev_get_queue(bdev), 0);
	if (result)
		return result;
	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
			      REQ_OP_READ);
	blk_queue_exit(bdev_get_queue(bdev));
	return result;
}

/**
 * bdev_write_page() - Start writing a page to a block device
 * @bdev: The device to write the page to
 * @sector: The offset on the device to write the page to (need not be aligned)
 * @page: The page to write
 * @wbc: The writeback_control for the write
 *
 * On entry, the page should be locked and not currently under writeback.
 * On exit, if the write started successfully, the page will be unlocked and
 * under writeback.  If the write failed already (e.g. the driver failed to
 * queue the page to the device), the page will still be locked.  If the
 * caller is a ->writepage implementation, it will need to unlock the page.
 *
 * Errors returned by this function are usually "soft", e.g. out of memory, or
 * queue full; callers should try a different route to write this page rather
 * than propagate an error back up the stack.
 *
 * Return: negative errno if an error occurs, 0 if submission was successful.
 */
int bdev_write_page(struct block_device *bdev, sector_t sector,
		struct page *page, struct writeback_control *wbc)
{
	int result;
	const struct block_device_operations *ops = bdev->bd_disk->fops;

	if (!ops->rw_page || bdev_get_integrity(bdev))
		return -EOPNOTSUPP;
	result = blk_queue_enter(bdev_get_queue(bdev), 0);
	if (result)
		return result;

	set_page_writeback(page);
	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
			      REQ_OP_WRITE);
	if (result) {
		end_page_writeback(page);
	} else {
		clean_page_buffers(page);
		unlock_page(page);
	}
	blk_queue_exit(bdev_get_queue(bdev));
	return result;
}
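
/*
 * Example (not part of the upstream file): a minimal sketch of the calling
 * convention described above - treating bdev_read_page() failures as "soft"
 * and falling back to another read path.  The function name is hypothetical
 * and the bio-based fallback is omitted.
 */
static void example_read_sector(struct block_device *bdev, sector_t sector,
				struct page *page)
{
	/* The page must be locked on entry, as bdev_read_page() requires. */
	if (!bdev_read_page(bdev, sector, page))
		return;	/* submitted; the page is unlocked on completion */

	/*
	 * Soft failure (-EOPNOTSUPP, memory pressure, queue full): submit a
	 * regular bio instead of propagating the error up the stack.
	 */
}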

/*
 * pseudo-fs
 */

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
static struct kmem_cache *bdev_cachep __read_mostly;

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL);

	if (!ei)
		return NULL;
	memset(&ei->bdev, 0, sizeof(ei->bdev));
	return &ei->vfs_inode;
}

static void bdev_free_inode(struct inode *inode)
{
	struct block_device *bdev = I_BDEV(inode);

	free_percpu(bdev->bd_stats);
	kfree(bdev->bd_meta_info);

	if (!bdev_is_partition(bdev)) {
		if (bdev->bd_disk && bdev->bd_disk->bdi)
			bdi_put(bdev->bd_disk->bdi);
		kfree(bdev->bd_disk);
	}

	if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
		blk_free_ext_minor(MINOR(bdev->bd_dev));

	kmem_cache_free(bdev_cachep, BDEV_I(inode));
}

static void init_once(void *data)
{
	struct bdev_inode *ei = data;

	inode_init_once(&ei->vfs_inode);
}

static void bdev_evict_inode(struct inode *inode)
{
	truncate_inode_pages_final(&inode->i_data);
	invalidate_inode_buffers(inode); /* is it needed here? */
	clear_inode(inode);
}

static const struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.free_inode = bdev_free_inode,
	.drop_inode = generic_delete_inode,
	.evict_inode = bdev_evict_inode,
};

static int bd_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);

	if (!ctx)
		return -ENOMEM;
	fc->s_iflags |= SB_I_CGROUPWB;
	ctx->ops = &bdev_sops;
	return 0;
}

static struct file_system_type bd_type = {
	.name = "bdev",
	.init_fs_context = bd_init_fs_context,
	.kill_sb = kill_anon_super,
};

struct super_block *blockdev_superblock __read_mostly;
EXPORT_SYMBOL_GPL(blockdev_superblock);

void __init bdev_cache_init(void)
{
	int err;
	static struct vfsmount *bd_mnt;

	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
			init_once);
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	bd_mnt = kern_mount(&bd_type);
	if (IS_ERR(bd_mnt))
		panic("Cannot create bdev pseudo-fs");
	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
}

struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = new_inode(blockdev_superblock);
	if (!inode)
		return NULL;
	inode->i_mode = S_IFBLK;
	inode->i_rdev = 0;
	inode->i_data.a_ops = &def_blk_aops;
	mapping_set_gfp_mask(&inode->i_data, GFP_USER);

	bdev = I_BDEV(inode);
	mutex_init(&bdev->bd_fsfreeze_mutex);
	spin_lock_init(&bdev->bd_size_lock);
	bdev->bd_partno = partno;
	bdev->bd_inode = inode;
	bdev->bd_queue = disk->queue;
	bdev->bd_stats = alloc_percpu(struct disk_stats);
	if (!bdev->bd_stats) {
		iput(inode);
		return NULL;
	}
	bdev->bd_disk = disk;
	return bdev;
}

void bdev_add(struct block_device *bdev, dev_t dev)
{
	bdev->bd_dev = dev;
	bdev->bd_inode->i_rdev = dev;
	bdev->bd_inode->i_ino = dev;
	insert_inode_hash(bdev->bd_inode);
}

long nr_blockdev_pages(void)
{
	struct inode *inode;
	long ret = 0;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
		ret += inode->i_mapping->nrpages;
	spin_unlock(&blockdev_superblock->s_inode_list_lock);

	return ret;
}

/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @whole: whole block device containing @bdev, may equal @bdev
 * @holder: holder trying to claim @bdev
 *
 * Test whether @bdev can be claimed by @holder.
 *
 * CONTEXT:
 * spin_lock(&bdev_lock).
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
			 void *holder)
{
	if (bdev->bd_holder == holder)
		return true;	 /* already a holder */
	else if (bdev->bd_holder != NULL)
		return false;	 /* held by someone else */
	else if (whole == bdev)
		return true;	 /* is a whole device which isn't held */
	else if (whole->bd_holder == bd_may_claim)
		return true;	 /* is a partition of a device that is being partitioned */
	else if (whole->bd_holder != NULL)
		return false;	 /* is a partition of a held device */
	else
		return true;	 /* is a partition of an un-held device */
}

/**
 * bd_prepare_to_claim - claim a block device
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 *
 * Claim @bdev.  This function fails if @bdev is already claimed by another
 * holder and waits if another claiming is in progress.  On successful
 * return, the caller has ownership of bd_claiming and bd_holder[s].
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY otherwise.
 */
int bd_prepare_to_claim(struct block_device *bdev, void *holder)
{
	struct block_device *whole = bdev_whole(bdev);

	if (WARN_ON_ONCE(!holder))
		return -EINVAL;
retry:
	spin_lock(&bdev_lock);
	/* if someone else claimed, fail */
	if (!bd_may_claim(bdev, whole, holder)) {
		spin_unlock(&bdev_lock);
		return -EBUSY;
	}

	/* if claiming is already in progress, wait for it to finish */
	if (whole->bd_claiming) {
		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
		DEFINE_WAIT(wait);

		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&bdev_lock);
		schedule();
		finish_wait(wq, &wait);
		goto retry;
	}

	/* yay, all mine */
	whole->bd_claiming = holder;
	spin_unlock(&bdev_lock);
	return 0;
}
EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */

static void bd_clear_claiming(struct block_device *whole, void *holder)
{
	lockdep_assert_held(&bdev_lock);
	/* tell others that we're done */
	BUG_ON(whole->bd_claiming != holder);
	whole->bd_claiming = NULL;
	wake_up_bit(&whole->bd_claiming, 0);
}

/**
 * bd_finish_claiming - finish claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 *
 * Finish exclusive open of a block device.  Mark the device as exclusively
 * open by the holder and wake up all waiters for exclusive open to finish.
 */
static void bd_finish_claiming(struct block_device *bdev, void *holder)
{
	struct block_device *whole = bdev_whole(bdev);

	spin_lock(&bdev_lock);
	BUG_ON(!bd_may_claim(bdev, whole, holder));
	/*
	 * Note that for a whole device bd_holders will be incremented twice,
	 * and bd_holder will be set to bd_may_claim before being set to holder
	 */
	whole->bd_holders++;
	whole->bd_holder = bd_may_claim;
	bdev->bd_holders++;
	bdev->bd_holder = holder;
	bd_clear_claiming(whole, holder);
	spin_unlock(&bdev_lock);
}

/**
 * bd_abort_claiming - abort claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 *
 * Abort claiming of a block device when the exclusive open failed.  This can
 * also be used when an exclusive open is not actually desired and we just
 * needed to block other exclusive openers for a while.
 */
void bd_abort_claiming(struct block_device *bdev, void *holder)
{
	spin_lock(&bdev_lock);
	bd_clear_claiming(bdev_whole(bdev), holder);
	spin_unlock(&bdev_lock);
}
EXPORT_SYMBOL(bd_abort_claiming);
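
/*
 * Example (not part of the upstream file): a minimal sketch of the pattern
 * used by truncate_bdev_range() above - temporarily blocking other exclusive
 * openers without performing a real exclusive open.  The function name and
 * the work done while claiming are hypothetical.
 */
static int example_block_exclusive_openers(struct block_device *bdev)
{
	int err = bd_prepare_to_claim(bdev, example_block_exclusive_openers);

	if (err)
		return err;

	/* ... do work while no new exclusive open can succeed ... */

	bd_abort_claiming(bdev, example_block_exclusive_openers);
	return 0;
}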

static void blkdev_flush_mapping(struct block_device *bdev)
{
	WARN_ON_ONCE(bdev->bd_holders);
	sync_blockdev(bdev);
	kill_bdev(bdev);
	bdev_write_inode(bdev);
}

static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (disk->fops->open) {
		ret = disk->fops->open(bdev, mode);
		if (ret) {
			/* avoid ghost partitions on a removed medium */
			if (ret == -ENOMEDIUM &&
			    test_bit(GD_NEED_PART_SCAN, &disk->state))
				bdev_disk_changed(disk, true);
			return ret;
		}
	}

	if (!atomic_read(&bdev->bd_openers))
		set_init_blocksize(bdev);
	if (test_bit(GD_NEED_PART_SCAN, &disk->state))
		bdev_disk_changed(disk, false);
	atomic_inc(&bdev->bd_openers);
	return 0;
}

static void blkdev_put_whole(struct block_device *bdev, fmode_t mode)
{
	if (atomic_dec_and_test(&bdev->bd_openers))
		blkdev_flush_mapping(bdev);
	if (bdev->bd_disk->fops->release)
		bdev->bd_disk->fops->release(bdev->bd_disk, mode);
}

static int blkdev_get_part(struct block_device *part, fmode_t mode)
{
	struct gendisk *disk = part->bd_disk;
	int ret;

	if (atomic_read(&part->bd_openers))
		goto done;

	ret = blkdev_get_whole(bdev_whole(part), mode);
	if (ret)
		return ret;

	ret = -ENXIO;
	if (!bdev_nr_sectors(part))
		goto out_blkdev_put;

	disk->open_partitions++;
	set_init_blocksize(part);
done:
	atomic_inc(&part->bd_openers);
	return 0;

out_blkdev_put:
	blkdev_put_whole(bdev_whole(part), mode);
	return ret;
}

static void blkdev_put_part(struct block_device *part, fmode_t mode)
{
	struct block_device *whole = bdev_whole(part);

	if (!atomic_dec_and_test(&part->bd_openers))
		return;
	blkdev_flush_mapping(part);
	whole->bd_disk->open_partitions--;
	blkdev_put_whole(whole, mode);
}

struct block_device *blkdev_get_no_open(dev_t dev)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = ilookup(blockdev_superblock, dev);
	if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
		blk_request_module(dev);
		inode = ilookup(blockdev_superblock, dev);
		if (inode)
			pr_warn_ratelimited(
				"block device autoloading is deprecated and will be removed.\n");
	}
	if (!inode)
		return NULL;

	/* switch from the inode reference to a device model one: */
	bdev = &BDEV_I(inode)->bdev;
	if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
		bdev = NULL;
	iput(inode);
	return bdev;
}

void blkdev_put_no_open(struct block_device *bdev)
{
	put_device(&bdev->bd_device);
}

/**
 * blkdev_get_by_dev - open a block device by device number
 * @dev: device number of block device to open
 * @mode: FMODE_* mask
 * @holder: exclusive holder identifier
 *
 * Open the block device described by device number @dev.  If @mode includes
 * %FMODE_EXCL, the block device is opened with exclusive access.  Specifying
 * %FMODE_EXCL with a %NULL @holder is invalid.  Exclusive opens may nest for
 * the same @holder.
 *
 * Use this interface ONLY if you really do not have anything better - i.e. when
 * you are behind a truly sucky interface and all you are given is a device
 * number.  Everything else should use blkdev_get_by_path().
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Reference to the block_device on success, ERR_PTR(-errno) on failure.
 */
struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
{
	bool unblock_events = true;
	struct block_device *bdev;
	struct gendisk *disk;
	int ret;

	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
			MAJOR(dev), MINOR(dev),
			((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) |
			((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0));
	if (ret)
		return ERR_PTR(ret);

	bdev = blkdev_get_no_open(dev);
	if (!bdev)
		return ERR_PTR(-ENXIO);
	disk = bdev->bd_disk;

	if (mode & FMODE_EXCL) {
		ret = bd_prepare_to_claim(bdev, holder);
		if (ret)
			goto put_blkdev;
	}

	disk_block_events(disk);

	mutex_lock(&disk->open_mutex);
	ret = -ENXIO;
	if (!disk_live(disk))
		goto abort_claiming;
	if (!try_module_get(disk->fops->owner))
		goto abort_claiming;
	if (bdev_is_partition(bdev))
		ret = blkdev_get_part(bdev, mode);
	else
		ret = blkdev_get_whole(bdev, mode);
	if (ret)
		goto put_module;
	if (mode & FMODE_EXCL) {
		bd_finish_claiming(bdev, holder);

		/*
		 * Block event polling for write claims if requested.  Any write
		 * holder makes the write_holder state stick until all are
		 * released.  This is good enough and tracking individual
		 * writeable reference is too fragile given the way @mode is
		 * used in blkdev_get/put().
		 */
		if ((mode & FMODE_WRITE) && !bdev->bd_write_holder &&
		    (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
			bdev->bd_write_holder = true;
			unblock_events = false;
		}
	}
	mutex_unlock(&disk->open_mutex);

	if (unblock_events)
		disk_unblock_events(disk);
	return bdev;
put_module:
	module_put(disk->fops->owner);
abort_claiming:
	if (mode & FMODE_EXCL)
		bd_abort_claiming(bdev, holder);
	mutex_unlock(&disk->open_mutex);
	disk_unblock_events(disk);
put_blkdev:
	blkdev_put_no_open(bdev);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(blkdev_get_by_dev);

/**
 * blkdev_get_by_path - open a block device by name
 * @path: path to the block device to open
 * @mode: FMODE_* mask
 * @holder: exclusive holder identifier
 *
 * Open the block device described by the device file at @path.  If @mode
 * includes %FMODE_EXCL, the block device is opened with exclusive access.
 * Specifying %FMODE_EXCL with a %NULL @holder is invalid.  Exclusive opens may
 * nest for the same @holder.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Reference to the block_device on success, ERR_PTR(-errno) on failure.
 */
struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
					void *holder)
{
	struct block_device *bdev;
	dev_t dev;
	int error;

	error = lookup_bdev(path, &dev);
	if (error)
		return ERR_PTR(error);

	bdev = blkdev_get_by_dev(dev, mode, holder);
	if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
		blkdev_put(bdev, mode);
		return ERR_PTR(-EACCES);
	}

	return bdev;
}
EXPORT_SYMBOL(blkdev_get_by_path);
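
/*
 * Example (not part of the upstream file): a minimal sketch of an exclusive
 * open by path as documented above.  The function name is hypothetical; the
 * same @holder and @mode must be passed to blkdev_put().
 */
static int example_open_exclusive(const char *path, void *holder)
{
	const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
	struct block_device *bdev;

	bdev = blkdev_get_by_path(path, mode, holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	/* ... use the device ... */

	blkdev_put(bdev, mode);
	return 0;
}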

void blkdev_put(struct block_device *bdev, fmode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;

	/*
	 * Sync early if it looks like we're the last one.  If someone else
	 * opens the block device between now and the decrement of bd_openers
	 * then we did a sync that we didn't need to, but that's not the end
	 * of the world and we want to avoid long (could be several minute)
	 * syncs while holding the mutex.
	 */
	if (atomic_read(&bdev->bd_openers) == 1)
		sync_blockdev(bdev);

	mutex_lock(&disk->open_mutex);
	if (mode & FMODE_EXCL) {
		struct block_device *whole = bdev_whole(bdev);
		bool bdev_free;

		/*
		 * Release a claim on the device.  The holder fields
		 * are protected with bdev_lock.  open_mutex is to
		 * synchronize disk_holder unlinking.
		 */
		spin_lock(&bdev_lock);
		WARN_ON_ONCE(--bdev->bd_holders < 0);
		WARN_ON_ONCE(--whole->bd_holders < 0);

		if ((bdev_free = !bdev->bd_holders))
			bdev->bd_holder = NULL;
		if (!whole->bd_holders)
			whole->bd_holder = NULL;
		spin_unlock(&bdev_lock);

		/*
		 * If this was the last claim, remove the holder link and
		 * unblock event polling if it was a write holder.
		 */
		if (bdev_free && bdev->bd_write_holder) {
			disk_unblock_events(disk);
			bdev->bd_write_holder = false;
		}
	}

	/*
	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
	 * event.  This is to ensure detection of media removal commanded
	 * from userland - e.g. eject(1).
	 */
	disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);

	if (bdev_is_partition(bdev))
		blkdev_put_part(bdev, mode);
	else
		blkdev_put_whole(bdev, mode);
	mutex_unlock(&disk->open_mutex);

	module_put(disk->fops->owner);
	blkdev_put_no_open(bdev);
}
EXPORT_SYMBOL(blkdev_put);

/**
 * lookup_bdev() - Look up a struct block_device by name.
 * @pathname: Name of the block device in the filesystem.
 * @dev: Pointer to the block device's dev_t, if found.
 *
 * Look up the block device's dev_t at @pathname in the current
 * namespace if possible and return it in @dev.
 *
 * Context: May sleep.
 * Return: 0 if succeeded, negative errno otherwise.
 */
int lookup_bdev(const char *pathname, dev_t *dev)
{
	struct inode *inode;
	struct path path;
	int error;

	if (!pathname || !*pathname)
		return -EINVAL;

	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
	if (error)
		return error;

	inode = d_backing_inode(path.dentry);
	error = -ENOTBLK;
	if (!S_ISBLK(inode->i_mode))
		goto out_path_put;
	error = -EACCES;
	if (!may_open_dev(&path))
		goto out_path_put;

	*dev = inode->i_rdev;
	error = 0;
out_path_put:
	path_put(&path);
	return error;
}
EXPORT_SYMBOL(lookup_bdev);
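
/*
 * Example (not part of the upstream file): a minimal sketch of resolving a
 * path to a dev_t with lookup_bdev() and then opening the device by number.
 * The function name is hypothetical; no exclusive access is requested, so a
 * NULL holder is allowed.
 */
static struct block_device *example_open_by_name(const char *path)
{
	dev_t dev;
	int error = lookup_bdev(path, &dev);

	if (error)
		return ERR_PTR(error);
	return blkdev_get_by_dev(dev, FMODE_READ, NULL);
}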

int __invalidate_device(struct block_device *bdev, bool kill_dirty)
{
	struct super_block *sb = get_super(bdev);
	int res = 0;

	if (sb) {
		/*
		 * No need to lock the super; get_super holds the
		 * read mutex so the filesystem cannot go away
		 * under us (->put_super runs with the write lock
		 * held).
		 */
		shrink_dcache_sb(sb);
		res = invalidate_inodes(sb, kill_dirty);
		drop_super(sb);
	}
	invalidate_bdev(bdev);
	return res;
}
EXPORT_SYMBOL(__invalidate_device);

void sync_bdevs(bool wait)
{
	struct inode *inode, *old_inode = NULL;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;
		struct block_device *bdev;

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
		    mapping->nrpages == 0) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&blockdev_superblock->s_inode_list_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from the s_inodes list while we dropped the
		 * s_inode_list_lock.  We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it under
		 * s_inode_list_lock.  So we keep the reference and iput it
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;
		bdev = I_BDEV(inode);

		mutex_lock(&bdev->bd_disk->open_mutex);
		if (!atomic_read(&bdev->bd_openers)) {
			; /* skip */
		} else if (wait) {
			/*
			 * We keep the error status of individual mappings so
			 * that applications can catch the writeback error using
			 * fsync(2).  See filemap_fdatawait_keep_errors() for
			 * details.
			 */
			filemap_fdatawait_keep_errors(inode->i_mapping);
		} else {
			filemap_fdatawrite(inode->i_mapping);
		}
		mutex_unlock(&bdev->bd_disk->open_mutex);

		spin_lock(&blockdev_superblock->s_inode_list_lock);
	}
	spin_unlock(&blockdev_superblock->s_inode_list_lock);
	iput(old_inode);
}

/*
 * Handle STATX_DIOALIGN for block devices.
 *
 * Note that the inode passed to this is the inode of a block device node file,
 * not the block device's internal inode.  Therefore it is *not* valid to use
 * I_BDEV() here; the block device has to be looked up by i_rdev instead.
 */
void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
{
	struct block_device *bdev;

	bdev = blkdev_get_no_open(inode->i_rdev);
	if (!bdev)
		return;

	stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
	stat->dio_offset_align = bdev_logical_block_size(bdev);
	stat->result_mask |= STATX_DIOALIGN;

	blkdev_put_no_open(bdev);
}