ordered-data.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/sched/mm.h>
#include "misc.h"
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
#include "disk-io.h"
#include "compression.h"
#include "delalloc-space.h"
#include "qgroup.h"
#include "subpage.h"

static struct kmem_cache *btrfs_ordered_extent_cache;

static u64 entry_end(struct btrfs_ordered_extent *entry)
{
        if (entry->file_offset + entry->num_bytes < entry->file_offset)
                return (u64)-1;
        return entry->file_offset + entry->num_bytes;
}

/*
 * Returns NULL if the insertion worked, otherwise returns the existing node
 * in the tree that already covers this offset.
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
                                   struct rb_node *node)
{
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
        struct btrfs_ordered_extent *entry;

        while (*p) {
                parent = *p;
                entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);

                if (file_offset < entry->file_offset)
                        p = &(*p)->rb_left;
                else if (file_offset >= entry_end(entry))
                        p = &(*p)->rb_right;
                else
                        return parent;
        }

        rb_link_node(node, parent, p);
        rb_insert_color(node, root);
        return NULL;
}

/*
 * Look for a given offset in the tree, and if it can't be found return the
 * first lesser offset.
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
                                     struct rb_node **prev_ret)
{
        struct rb_node *n = root->rb_node;
        struct rb_node *prev = NULL;
        struct rb_node *test;
        struct btrfs_ordered_extent *entry;
        struct btrfs_ordered_extent *prev_entry = NULL;

        while (n) {
                entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
                prev = n;
                prev_entry = entry;

                if (file_offset < entry->file_offset)
                        n = n->rb_left;
                else if (file_offset >= entry_end(entry))
                        n = n->rb_right;
                else
                        return n;
        }
        if (!prev_ret)
                return NULL;

        while (prev && file_offset >= entry_end(prev_entry)) {
                test = rb_next(prev);
                if (!test)
                        break;
                prev_entry = rb_entry(test, struct btrfs_ordered_extent,
                                      rb_node);
                if (file_offset < entry_end(prev_entry))
                        break;

                prev = test;
        }
        if (prev)
                prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
                                      rb_node);
        while (prev && file_offset < entry_end(prev_entry)) {
                test = rb_prev(prev);
                if (!test)
                        break;
                prev_entry = rb_entry(test, struct btrfs_ordered_extent,
                                      rb_node);
                prev = test;
        }
        *prev_ret = prev;
        return NULL;
}

static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
                          u64 len)
{
        if (file_offset + len <= entry->file_offset ||
            entry->file_offset + entry->num_bytes <= file_offset)
                return 0;
        return 1;
}

/*
 * Look for the first ordered struct that covers this offset, otherwise return
 * the first one less than this offset.
 */
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
                                          u64 file_offset)
{
        struct rb_root *root = &tree->tree;
        struct rb_node *prev = NULL;
        struct rb_node *ret;
        struct btrfs_ordered_extent *entry;

        if (tree->last) {
                entry = rb_entry(tree->last, struct btrfs_ordered_extent,
                                 rb_node);
                if (in_range(file_offset, entry->file_offset, entry->num_bytes))
                        return tree->last;
        }
        ret = __tree_search(root, file_offset, &prev);
        if (!ret)
                ret = prev;
        if (ret)
                tree->last = ret;
        return ret;
}

/**
 * Add an ordered extent to the per-inode tree.
 *
 * @inode:           Inode that this extent is for.
 * @file_offset:     Logical offset in file where the extent starts.
 * @num_bytes:       Logical length of extent in file.
 * @ram_bytes:       Full length of unencoded data.
 * @disk_bytenr:     Offset of extent on disk.
 * @disk_num_bytes:  Size of extent on disk.
 * @offset:          Offset into unencoded data where file data starts.
 * @flags:           Flags specifying type of extent (1 << BTRFS_ORDERED_*).
 * @compress_type:   Compression algorithm used for data.
 *
 * Most of these parameters correspond to &struct btrfs_file_extent_item. The
 * tree is given a single reference on the ordered extent that was inserted.
 *
 * Return: 0 or -ENOMEM.
 */
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
                             u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
                             u64 disk_num_bytes, u64 offset, unsigned flags,
                             int compress_type)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry;
        int ret;

        if (flags &
            ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
                /* For nocow write, we can release the qgroup rsv right now */
                ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
                if (ret < 0)
                        return ret;
                ret = 0;
        } else {
                /*
                 * The ordered extent has reserved qgroup space, release now
                 * and pass the reserved number for qgroup_record to free.
                 */
                ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
                if (ret < 0)
                        return ret;
        }
        entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
        if (!entry)
                return -ENOMEM;

        entry->file_offset = file_offset;
        entry->num_bytes = num_bytes;
        entry->ram_bytes = ram_bytes;
        entry->disk_bytenr = disk_bytenr;
        entry->disk_num_bytes = disk_num_bytes;
        entry->offset = offset;
        entry->bytes_left = num_bytes;
        entry->inode = igrab(&inode->vfs_inode);
        entry->compress_type = compress_type;
        entry->truncated_len = (u64)-1;
        entry->qgroup_rsv = ret;
        entry->physical = (u64)-1;

        ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
        entry->flags = flags;

        percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
                                 fs_info->delalloc_batch);

        /* one ref for the tree */
        refcount_set(&entry->refs, 1);
        init_waitqueue_head(&entry->wait);
        INIT_LIST_HEAD(&entry->list);
        INIT_LIST_HEAD(&entry->log_list);
        INIT_LIST_HEAD(&entry->root_extent_list);
        INIT_LIST_HEAD(&entry->work_list);
        init_completion(&entry->completion);

        trace_btrfs_ordered_extent_add(inode, entry);

        spin_lock_irq(&tree->lock);
        node = tree_insert(&tree->tree, file_offset,
                           &entry->rb_node);
        if (node)
                btrfs_panic(fs_info, -EEXIST,
                            "inconsistency in ordered tree at offset %llu",
                            file_offset);
        spin_unlock_irq(&tree->lock);

        spin_lock(&root->ordered_extent_lock);
        list_add_tail(&entry->root_extent_list,
                      &root->ordered_extents);
        root->nr_ordered_extents++;
        if (root->nr_ordered_extents == 1) {
                spin_lock(&fs_info->ordered_root_lock);
                BUG_ON(!list_empty(&root->ordered_root));
                list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
                spin_unlock(&fs_info->ordered_root_lock);
        }
        spin_unlock(&root->ordered_extent_lock);

        /*
         * We don't need the count_max_extents here, we can assume that all of
         * that work has been done at higher layers, so this is truly the
         * smallest the extent is going to get.
         */
        spin_lock(&inode->lock);
        btrfs_mod_outstanding_extents(inode, 1);
        spin_unlock(&inode->lock);

        return 0;
}

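/*
 * Illustrative sketch (hypothetical caller, not part of this file): a regular
 * COW write path would typically create the ordered extent right after the
 * data extent has been reserved, roughly like the call below. The variables
 * (start, len, ins) are placeholders for the caller's reservation; see the
 * callers in inode.c for the real usage.
 *
 *      ret = btrfs_add_ordered_extent(inode, start, len, len,
 *                                     ins.objectid, ins.offset, 0,
 *                                     1 << BTRFS_ORDERED_REGULAR,
 *                                     BTRFS_COMPRESS_NONE);
 */
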
/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
 * when an ordered extent is finished. If the list covers more than one
 * ordered extent, it is split across multiple ordered extents.
 */
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
                           struct btrfs_ordered_sum *sum)
{
        struct btrfs_ordered_inode_tree *tree;

        tree = &BTRFS_I(entry->inode)->ordered_tree;
        spin_lock_irq(&tree->lock);
        list_add_tail(&sum->list, &entry->list);
        spin_unlock_irq(&tree->lock);
}

static void finish_ordered_fn(struct btrfs_work *work)
{
        struct btrfs_ordered_extent *ordered_extent;

        ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
        btrfs_finish_ordered_io(ordered_extent);
}

/*
 * Mark all ordered extents io inside the specified range finished.
 *
 * @page:  The involved page for the operation.
 *         For uncompressed buffered IO, the page status also needs to be
 *         updated to indicate whether the pending ordered io is finished.
 *         Can be NULL for direct IO and compressed write.
 *         For these cases, callers are ensured they won't execute the
 *         endio function twice.
 *
 * This function is called for endio, thus the range must have ordered
 * extent(s) covering it.
 */
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
                                    struct page *page, u64 file_offset,
                                    u64 num_bytes, bool uptodate)
{
        struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_workqueue *wq;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;
        unsigned long flags;
        u64 cur = file_offset;

        if (btrfs_is_free_space_inode(inode))
                wq = fs_info->endio_freespace_worker;
        else
                wq = fs_info->endio_write_workers;

        if (page)
                ASSERT(page->mapping && page_offset(page) <= file_offset &&
                       file_offset + num_bytes <= page_offset(page) + PAGE_SIZE);

        spin_lock_irqsave(&tree->lock, flags);
        while (cur < file_offset + num_bytes) {
                u64 entry_end;
                u64 end;
                u32 len;

                node = tree_search(tree, cur);
                /* No ordered extents at all */
                if (!node)
                        break;

                entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
                entry_end = entry->file_offset + entry->num_bytes;
                /*
                 * |<-- OE --->|  |
                 *              cur
                 * Go to next OE.
                 */
                if (cur >= entry_end) {
                        node = rb_next(node);
                        /* No more ordered extents, exit */
                        if (!node)
                                break;
                        entry = rb_entry(node, struct btrfs_ordered_extent,
                                         rb_node);

                        /* Go to next ordered extent and continue */
                        cur = entry->file_offset;
                        continue;
                }
                /*
                 * |    |<--- OE --->|
                 * cur
                 * Go to the start of OE.
                 */
                if (cur < entry->file_offset) {
                        cur = entry->file_offset;
                        continue;
                }

                /*
                 * Now we are definitely inside one ordered extent.
                 *
                 * |<--- OE --->|
                 *      |
                 *      cur
                 */
                end = min(entry->file_offset + entry->num_bytes,
                          file_offset + num_bytes) - 1;
                ASSERT(end + 1 - cur < U32_MAX);
                len = end + 1 - cur;

                if (page) {
                        /*
                         * Ordered (Private2) bit indicates whether we still
                         * have pending io unfinished for the ordered extent.
                         *
                         * If there's no such bit, we need to skip to next range.
                         */
                        if (!btrfs_page_test_ordered(fs_info, page, cur, len)) {
                                cur += len;
                                continue;
                        }
                        btrfs_page_clear_ordered(fs_info, page, cur, len);
                }

                /* Now we're fine to update the accounting */
                if (unlikely(len > entry->bytes_left)) {
                        WARN_ON(1);
                        btrfs_crit(fs_info,
"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu",
                                   inode->root->root_key.objectid,
                                   btrfs_ino(inode),
                                   entry->file_offset,
                                   entry->num_bytes,
                                   len, entry->bytes_left);
                        entry->bytes_left = 0;
                } else {
                        entry->bytes_left -= len;
                }

                if (!uptodate)
                        set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

                /*
                 * All the IO of the ordered extent is finished, we need to queue
                 * the finish_func to be executed.
                 */
                if (entry->bytes_left == 0) {
                        set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
                        cond_wake_up(&entry->wait);
                        refcount_inc(&entry->refs);
                        trace_btrfs_ordered_extent_mark_finished(inode, entry);
                        spin_unlock_irqrestore(&tree->lock, flags);
                        btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL);
                        btrfs_queue_work(wq, &entry->work);
                        spin_lock_irqsave(&tree->lock, flags);
                }
                cur += len;
        }
        spin_unlock_irqrestore(&tree->lock, flags);
}

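/*
 * Illustrative sketch (hypothetical caller, not part of this file): a write
 * endio path is expected to report a finished byte range roughly as
 *
 *      btrfs_mark_ordered_io_finished(inode, page, start,
 *                                     end + 1 - start, uptodate);
 *
 * where [start, end] is the range of the page that just completed writeback.
 */
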
/*
 * Finish IO for one ordered extent across a given range. The range can only
 * contain one ordered extent.
 *
 * @cached:      The cached ordered extent. If not NULL, we can skip the tree
 *               search and use the ordered extent directly.
 *               Will also be used to store the finished ordered extent.
 * @file_offset: File offset for the finished IO
 * @io_size:     Length of the finished IO range
 *
 * Return true if the ordered extent is finished in the range, and update
 * @cached.
 * Return false otherwise.
 *
 * NOTE: The range can NOT cross multiple ordered extents.
 * Thus the caller should ensure the range doesn't cross ordered extents.
 */
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
                                    struct btrfs_ordered_extent **cached,
                                    u64 file_offset, u64 io_size)
{
        struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;
        unsigned long flags;
        bool finished = false;

        spin_lock_irqsave(&tree->lock, flags);
        if (cached && *cached) {
                entry = *cached;
                goto have_entry;
        }

        node = tree_search(tree, file_offset);
        if (!node)
                goto out;

        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
        if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
                goto out;

        if (io_size > entry->bytes_left)
                btrfs_crit(inode->root->fs_info,
                           "bad ordered accounting left %llu size %llu",
                           entry->bytes_left, io_size);

        entry->bytes_left -= io_size;

        if (entry->bytes_left == 0) {
                /*
                 * Ensure only one caller can set the flag and finished_ret
                 * accordingly
                 */
                finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
                /* test_and_set_bit implies a barrier */
                cond_wake_up_nomb(&entry->wait);
        }
out:
        if (finished && cached && entry) {
                *cached = entry;
                refcount_inc(&entry->refs);
                trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
        }
        spin_unlock_irqrestore(&tree->lock, flags);
        return finished;
}

/*
 * Used to drop a reference on an ordered extent. This will free
 * the extent if the last reference is dropped.
 */
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
        struct list_head *cur;
        struct btrfs_ordered_sum *sum;

        trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);

        if (refcount_dec_and_test(&entry->refs)) {
                ASSERT(list_empty(&entry->root_extent_list));
                ASSERT(list_empty(&entry->log_list));
                ASSERT(RB_EMPTY_NODE(&entry->rb_node));
                if (entry->inode)
                        btrfs_add_delayed_iput(entry->inode);
                while (!list_empty(&entry->list)) {
                        cur = entry->list.next;
                        sum = list_entry(cur, struct btrfs_ordered_sum, list);
                        list_del(&sum->list);
                        kvfree(sum);
                }
                kmem_cache_free(btrfs_ordered_extent_cache, entry);
        }
}

/*
 * Remove an ordered extent from the tree. No references are dropped
 * and waiters are woken up.
 */
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
                                 struct btrfs_ordered_extent *entry)
{
        struct btrfs_ordered_inode_tree *tree;
        struct btrfs_root *root = btrfs_inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct rb_node *node;
        bool pending;
        bool freespace_inode;

        /*
         * If this is a free space inode the thread has not acquired the ordered
         * extents lockdep map.
         */
        freespace_inode = btrfs_is_free_space_inode(btrfs_inode);

        btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
        /* This is paired with btrfs_add_ordered_extent. */
        spin_lock(&btrfs_inode->lock);
        btrfs_mod_outstanding_extents(btrfs_inode, -1);
        spin_unlock(&btrfs_inode->lock);
        if (root != fs_info->tree_root) {
                u64 release;

                if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
                        release = entry->disk_num_bytes;
                else
                        release = entry->num_bytes;
                btrfs_delalloc_release_metadata(btrfs_inode, release, false);
        }

        percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
                                 fs_info->delalloc_batch);

        tree = &btrfs_inode->ordered_tree;
        spin_lock_irq(&tree->lock);
        node = &entry->rb_node;
        rb_erase(node, &tree->tree);
        RB_CLEAR_NODE(node);
        if (tree->last == node)
                tree->last = NULL;
        set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
        pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
        spin_unlock_irq(&tree->lock);

        /*
         * The current running transaction is waiting on us, we need to let it
         * know that we're complete and wake it up.
         */
        if (pending) {
                struct btrfs_transaction *trans;

                /*
                 * The checks for trans are just a formality, it should be set,
                 * but if it isn't we don't want to deref/assert under the spin
                 * lock, so be nice and check if trans is set, but ASSERT() so
                 * if it isn't set a developer will notice.
                 */
                spin_lock(&fs_info->trans_lock);
                trans = fs_info->running_transaction;
                if (trans)
                        refcount_inc(&trans->use_count);
                spin_unlock(&fs_info->trans_lock);

                ASSERT(trans || BTRFS_FS_ERROR(fs_info));
                if (trans) {
                        if (atomic_dec_and_test(&trans->pending_ordered))
                                wake_up(&trans->pending_wait);
                        btrfs_put_transaction(trans);
                }
        }

        btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered);

        spin_lock(&root->ordered_extent_lock);
        list_del_init(&entry->root_extent_list);
        root->nr_ordered_extents--;

        trace_btrfs_ordered_extent_remove(btrfs_inode, entry);

        if (!root->nr_ordered_extents) {
                spin_lock(&fs_info->ordered_root_lock);
                BUG_ON(list_empty(&root->ordered_root));
                list_del_init(&root->ordered_root);
                spin_unlock(&fs_info->ordered_root_lock);
        }
        spin_unlock(&root->ordered_extent_lock);
        wake_up(&entry->wait);
        if (!freespace_inode)
                btrfs_lockdep_release(fs_info, btrfs_ordered_extent);
}

static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
        struct btrfs_ordered_extent *ordered;

        ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
        btrfs_start_ordered_extent(ordered, 1);
        complete(&ordered->completion);
}

/*
 * Wait for all the ordered extents in a root. This is done when balancing
 * space between drives.
 */
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
                               const u64 range_start, const u64 range_len)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        LIST_HEAD(splice);
        LIST_HEAD(skipped);
        LIST_HEAD(works);
        struct btrfs_ordered_extent *ordered, *next;
        u64 count = 0;
        const u64 range_end = range_start + range_len;

        mutex_lock(&root->ordered_extent_mutex);
        spin_lock(&root->ordered_extent_lock);
        list_splice_init(&root->ordered_extents, &splice);
        while (!list_empty(&splice) && nr) {
                ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
                                           root_extent_list);

                if (range_end <= ordered->disk_bytenr ||
                    ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
                        list_move_tail(&ordered->root_extent_list, &skipped);
                        cond_resched_lock(&root->ordered_extent_lock);
                        continue;
                }

                list_move_tail(&ordered->root_extent_list,
                               &root->ordered_extents);
                refcount_inc(&ordered->refs);
                spin_unlock(&root->ordered_extent_lock);

                btrfs_init_work(&ordered->flush_work,
                                btrfs_run_ordered_extent_work, NULL, NULL);
                list_add_tail(&ordered->work_list, &works);
                btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);

                cond_resched();
                spin_lock(&root->ordered_extent_lock);
                if (nr != U64_MAX)
                        nr--;
                count++;
        }
        list_splice_tail(&skipped, &root->ordered_extents);
        list_splice_tail(&splice, &root->ordered_extents);
        spin_unlock(&root->ordered_extent_lock);

        list_for_each_entry_safe(ordered, next, &works, work_list) {
                list_del_init(&ordered->work_list);
                wait_for_completion(&ordered->completion);
                btrfs_put_ordered_extent(ordered);
                cond_resched();
        }
        mutex_unlock(&root->ordered_extent_mutex);

        return count;
}

void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
                              const u64 range_start, const u64 range_len)
{
        struct btrfs_root *root;
        struct list_head splice;
        u64 done;

        INIT_LIST_HEAD(&splice);

        mutex_lock(&fs_info->ordered_operations_mutex);
        spin_lock(&fs_info->ordered_root_lock);
        list_splice_init(&fs_info->ordered_roots, &splice);
        while (!list_empty(&splice) && nr) {
                root = list_first_entry(&splice, struct btrfs_root,
                                        ordered_root);
                root = btrfs_grab_root(root);
                BUG_ON(!root);
                list_move_tail(&root->ordered_root,
                               &fs_info->ordered_roots);
                spin_unlock(&fs_info->ordered_root_lock);

                done = btrfs_wait_ordered_extents(root, nr,
                                                  range_start, range_len);
                btrfs_put_root(root);

                spin_lock(&fs_info->ordered_root_lock);
                if (nr != U64_MAX)
                        nr -= done;
        }
        list_splice_tail(&splice, &fs_info->ordered_roots);
        spin_unlock(&fs_info->ordered_root_lock);
        mutex_unlock(&fs_info->ordered_operations_mutex);
}

/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the io completion code to insert
 * metadata into the btree corresponding to the extent.
 */
void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
{
        u64 start = entry->file_offset;
        u64 end = start + entry->num_bytes - 1;
        struct btrfs_inode *inode = BTRFS_I(entry->inode);
        bool freespace_inode;

        trace_btrfs_ordered_extent_start(inode, entry);

        /*
         * If this is a free space inode do not take the ordered extents lockdep
         * map.
         */
        freespace_inode = btrfs_is_free_space_inode(inode);

        /*
         * Pages in the range can be dirty, clean or writeback. We
         * start IO on any dirty ones so the wait doesn't stall waiting
         * for the flusher thread to find them.
         */
        if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
                filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
        if (wait) {
                if (!freespace_inode)
                        btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
                wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
                                                 &entry->flags));
        }
}

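/*
 * Illustrative sketch (hypothetical caller, not part of this file): waiting
 * for any ordered extent covering a range usually follows the
 * lookup/start/put pattern used elsewhere in this file:
 *
 *      ordered = btrfs_lookup_ordered_range(inode, start, len);
 *      if (ordered) {
 *              btrfs_start_ordered_extent(ordered, 1);
 *              btrfs_put_ordered_extent(ordered);
 *      }
 */
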
/*
 * Used to wait on ordered extents across a large range of bytes.
 */
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
        int ret = 0;
        int ret_wb = 0;
        u64 end;
        u64 orig_end;
        struct btrfs_ordered_extent *ordered;

        if (start + len < start) {
                orig_end = INT_LIMIT(loff_t);
        } else {
                orig_end = start + len - 1;
                if (orig_end > INT_LIMIT(loff_t))
                        orig_end = INT_LIMIT(loff_t);
        }

        /*
         * Start IO across the range first to instantiate any delalloc
         * extents.
         */
        ret = btrfs_fdatawrite_range(inode, start, orig_end);
        if (ret)
                return ret;

        /*
         * If we have a writeback error don't return immediately. Wait first
         * for any ordered extents that haven't completed yet. This is to make
         * sure no one can dirty the same page ranges and call writepages()
         * before the ordered extents complete - to avoid failures (-EEXIST)
         * when adding the new ordered extents to the ordered tree.
         */
        ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);

        end = orig_end;
        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
                if (!ordered)
                        break;
                if (ordered->file_offset > orig_end) {
                        btrfs_put_ordered_extent(ordered);
                        break;
                }
                if (ordered->file_offset + ordered->num_bytes <= start) {
                        btrfs_put_ordered_extent(ordered);
                        break;
                }
                btrfs_start_ordered_extent(ordered, 1);
                end = ordered->file_offset;
                /*
                 * If the ordered extent had an error save the error but don't
                 * exit without waiting first for all other ordered extents in
                 * the range to complete.
                 */
                if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
                        ret = -EIO;
                btrfs_put_ordered_extent(ordered);
                if (end == 0 || end == start)
                        break;
                end--;
        }
        return ret_wb ? ret_wb : ret;
}

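/*
 * Illustrative sketch (hypothetical caller, not part of this file):
 * fsync-style paths typically wait on a whole file range with something like
 *
 *      ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
 *      if (ret)
 *              return ret;
 */
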
/*
 * Find an ordered extent corresponding to file_offset. Return NULL if
 * nothing is found, otherwise take a reference on the extent and return it.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
                                                         u64 file_offset)
{
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;
        unsigned long flags;

        tree = &inode->ordered_tree;
        spin_lock_irqsave(&tree->lock, flags);
        node = tree_search(tree, file_offset);
        if (!node)
                goto out;

        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
        if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
                entry = NULL;
        if (entry) {
                refcount_inc(&entry->refs);
                trace_btrfs_ordered_extent_lookup(inode, entry);
        }
out:
        spin_unlock_irqrestore(&tree->lock, flags);
        return entry;
}

/*
 * Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
                struct btrfs_inode *inode, u64 file_offset, u64 len)
{
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;

        tree = &inode->ordered_tree;
        spin_lock_irq(&tree->lock);
        node = tree_search(tree, file_offset);
        if (!node) {
                node = tree_search(tree, file_offset + len);
                if (!node)
                        goto out;
        }

        while (1) {
                entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
                if (range_overlaps(entry, file_offset, len))
                        break;

                if (entry->file_offset >= file_offset + len) {
                        entry = NULL;
                        break;
                }
                entry = NULL;
                node = rb_next(node);
                if (!node)
                        break;
        }
out:
        if (entry) {
                refcount_inc(&entry->refs);
                trace_btrfs_ordered_extent_lookup_range(inode, entry);
        }
        spin_unlock_irq(&tree->lock);
        return entry;
}

/*
 * Adds all ordered extents to the given list. The list ends up sorted by the
 * file_offset of the ordered extents.
 */
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
                                           struct list_head *list)
{
        struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
        struct rb_node *n;

        ASSERT(inode_is_locked(&inode->vfs_inode));

        spin_lock_irq(&tree->lock);
        for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
                struct btrfs_ordered_extent *ordered;

                ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);

                if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
                        continue;

                ASSERT(list_empty(&ordered->log_list));
                list_add_tail(&ordered->log_list, list);
                refcount_inc(&ordered->refs);
                trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
        }
        spin_unlock_irq(&tree->lock);
}

/*
 * Lookup and return any ordered extent at or before 'file_offset'. NULL is
 * returned if none is found.
 */
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;

        tree = &inode->ordered_tree;
        spin_lock_irq(&tree->lock);
        node = tree_search(tree, file_offset);
        if (!node)
                goto out;

        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
        refcount_inc(&entry->refs);
        trace_btrfs_ordered_extent_lookup_first(inode, entry);
out:
        spin_unlock_irq(&tree->lock);
        return entry;
}

/*
 * Lookup the first ordered extent that overlaps the range
 * [@file_offset, @file_offset + @len).
 *
 * The difference between this and btrfs_lookup_first_ordered_extent() is
 * that this one won't return any ordered extent that does not overlap the range.
 * And the difference against btrfs_lookup_ordered_extent() is, this function
 * ensures the first ordered extent gets returned.
 */
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
                struct btrfs_inode *inode, u64 file_offset, u64 len)
{
        struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
        struct rb_node *node;
        struct rb_node *cur;
        struct rb_node *prev;
        struct rb_node *next;
        struct btrfs_ordered_extent *entry = NULL;

        spin_lock_irq(&tree->lock);
        node = tree->tree.rb_node;
        /*
         * Here we don't want to use tree_search() which will use tree->last
         * and screw up the search order.
         * And __tree_search() can't return the adjacent ordered extents
         * either, thus here we do our own search.
         */
        while (node) {
                entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);

                if (file_offset < entry->file_offset) {
                        node = node->rb_left;
                } else if (file_offset >= entry_end(entry)) {
                        node = node->rb_right;
                } else {
                        /*
                         * Direct hit, got an ordered extent that starts at
                         * @file_offset
                         */
                        goto out;
                }
        }
        if (!entry) {
                /* Empty tree */
                goto out;
        }

        cur = &entry->rb_node;
        /* We got an entry around @file_offset, check adjacent entries */
        if (entry->file_offset < file_offset) {
                prev = cur;
                next = rb_next(cur);
        } else {
                prev = rb_prev(cur);
                next = cur;
        }
        if (prev) {
                entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
                if (range_overlaps(entry, file_offset, len))
                        goto out;
        }
        if (next) {
                entry = rb_entry(next, struct btrfs_ordered_extent, rb_node);
                if (range_overlaps(entry, file_offset, len))
                        goto out;
        }
        /* No ordered extent in the range */
        entry = NULL;
out:
        if (entry) {
                refcount_inc(&entry->refs);
                trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
        }
        spin_unlock_irq(&tree->lock);
        return entry;
}

/*
 * btrfs_lock_and_flush_ordered_range - Lock the passed range and ensure all
 * pending ordered extents in it are run to completion.
 *
 * @inode:        Inode whose ordered tree is to be searched
 * @start:        Beginning of range to flush
 * @end:          Last byte of range to lock
 * @cached_state: If passed, will return the extent state responsible for the
 *                locked range. It's the caller's responsibility to free the
 *                cached state.
 *
 * This function always returns with the given range locked, ensuring that
 * after it's called no ordered extent can be pending in it.
 */
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
                                        u64 end,
                                        struct extent_state **cached_state)
{
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cache = NULL;
        struct extent_state **cachedp = &cache;

        if (cached_state)
                cachedp = cached_state;

        while (1) {
                lock_extent(&inode->io_tree, start, end, cachedp);
                ordered = btrfs_lookup_ordered_range(inode, start,
                                                     end - start + 1);
                if (!ordered) {
                        /*
                         * If no external cached_state has been passed then
                         * decrement the extra ref taken for cachedp since we
                         * aren't exposing it outside of this function
                         */
                        if (!cached_state)
                                refcount_dec(&cache->refs);
                        break;
                }
                unlock_extent(&inode->io_tree, start, end, cachedp);
                btrfs_start_ordered_extent(ordered, 1);
                btrfs_put_ordered_extent(ordered);
        }
}

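/*
 * Illustrative sketch (hypothetical caller, not part of this file): callers
 * are expected to drop the extent lock themselves once they are done with
 * the range, e.g.
 *
 *      struct extent_state *cached_state = NULL;
 *
 *      btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state);
 *      // ... operate on the locked range, now free of pending ordered IO ...
 *      unlock_extent(&inode->io_tree, start, end, &cached_state);
 */
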
/*
 * Lock the passed range and ensure all pending ordered extents in it are run
 * to completion in nowait mode.
 *
 * Return true if the range was locked and no ordered extents were found in
 * it, otherwise false.
 */
bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end)
{
        struct btrfs_ordered_extent *ordered;

        if (!try_lock_extent(&inode->io_tree, start, end))
                return false;

        ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
        if (!ordered)
                return true;

        btrfs_put_ordered_extent(ordered);
        unlock_extent(&inode->io_tree, start, end, NULL);
        return false;
}

static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
                                u64 len)
{
        struct inode *inode = ordered->inode;
        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
        u64 file_offset = ordered->file_offset + pos;
        u64 disk_bytenr = ordered->disk_bytenr + pos;
        unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;

        /*
         * The splitting extent is already counted and will be added again in
         * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting.
         */
        percpu_counter_add_batch(&fs_info->ordered_bytes, -len,
                                 fs_info->delalloc_batch);
        WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED));
        return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len,
                                        disk_bytenr, len, 0, flags,
                                        ordered->compress_type);
}

int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
                               u64 post)
{
        struct inode *inode = ordered->inode;
        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
        struct rb_node *node;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        int ret = 0;

        trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered);

        spin_lock_irq(&tree->lock);
        /* Remove from tree once */
        node = &ordered->rb_node;
        rb_erase(node, &tree->tree);
        RB_CLEAR_NODE(node);
        if (tree->last == node)
                tree->last = NULL;

        ordered->file_offset += pre;
        ordered->disk_bytenr += pre;
        ordered->num_bytes -= (pre + post);
        ordered->disk_num_bytes -= (pre + post);
        ordered->bytes_left -= (pre + post);

        /* Re-insert the node */
        node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
        if (node)
                btrfs_panic(fs_info, -EEXIST,
                            "zoned: inconsistency in ordered tree at offset %llu",
                            ordered->file_offset);

        spin_unlock_irq(&tree->lock);

        if (pre)
                ret = clone_ordered_extent(ordered, 0, pre);
        if (ret == 0 && post)
                ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes,
                                           post);

        return ret;
}

int __init ordered_data_init(void)
{
        btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
                                        sizeof(struct btrfs_ordered_extent), 0,
                                        SLAB_MEM_SPREAD,
                                        NULL);
        if (!btrfs_ordered_extent_cache)
                return -ENOMEM;

        return 0;
}

void __cold ordered_data_exit(void)
{
        kmem_cache_destroy(btrfs_ordered_extent_cache);
}