// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2014-2016 Christoph Hellwig.
 */
#include <linux/vmalloc.h>

#include "blocklayout.h"

#define NFSDBG_FACILITY		NFSDBG_PNFS_LD

static inline struct pnfs_block_extent *
ext_node(struct rb_node *node)
{
	return rb_entry(node, struct pnfs_block_extent, be_node);
}

static struct pnfs_block_extent *
ext_tree_first(struct rb_root *root)
{
	struct rb_node *node = rb_first(root);
	return node ? ext_node(node) : NULL;
}

static struct pnfs_block_extent *
ext_tree_prev(struct pnfs_block_extent *be)
{
	struct rb_node *node = rb_prev(&be->be_node);
	return node ? ext_node(node) : NULL;
}

static struct pnfs_block_extent *
ext_tree_next(struct pnfs_block_extent *be)
{
	struct rb_node *node = rb_next(&be->be_node);
	return node ? ext_node(node) : NULL;
}

static inline sector_t
ext_f_end(struct pnfs_block_extent *be)
{
	return be->be_f_offset + be->be_length;
}
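
/*
 * Return the extent covering @start, or failing that the first extent
 * beginning after @start, or NULL if the tree ends before @start.
 */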
static struct pnfs_block_extent *
__ext_tree_search(struct rb_root *root, sector_t start)
{
	struct rb_node *node = root->rb_node;
	struct pnfs_block_extent *be = NULL;

	while (node) {
		be = ext_node(node);
		if (start < be->be_f_offset)
			node = node->rb_left;
		else if (start >= ext_f_end(be))
			node = node->rb_right;
		else
			return be;
	}

	if (be) {
		if (start < be->be_f_offset)
			return be;

		if (start >= ext_f_end(be))
			return ext_tree_next(be);
	}

	return NULL;
}
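
/*
 * Two extents can be merged when they are adjacent in the file, share the
 * same device, state and (for invalid extents) tag, and are contiguous on
 * the volume unless they describe holes (PNFS_BLOCK_NONE_DATA).
 */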
static bool
ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
{
	if (be1->be_state != be2->be_state)
		return false;
	if (be1->be_device != be2->be_device)
		return false;

	if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
		return false;

	if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
	    (be1->be_v_offset + be1->be_length != be2->be_v_offset))
		return false;

	if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
	    be1->be_tag != be2->be_tag)
		return false;

	return true;
}

static struct pnfs_block_extent *
ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
{
	struct pnfs_block_extent *left = ext_tree_prev(be);

	if (left && ext_can_merge(left, be)) {
		left->be_length += be->be_length;
		rb_erase(&be->be_node, root);
		nfs4_put_deviceid_node(be->be_device);
		kfree(be);
		return left;
	}

	return be;
}

static struct pnfs_block_extent *
ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
{
	struct pnfs_block_extent *right = ext_tree_next(be);

	if (right && ext_can_merge(be, right)) {
		be->be_length += right->be_length;
		rb_erase(&right->be_node, root);
		nfs4_put_deviceid_node(right->be_device);
		kfree(right);
	}

	return be;
}

static void __ext_put_deviceids(struct list_head *head)
{
	struct pnfs_block_extent *be, *tmp;

	list_for_each_entry_safe(be, tmp, head, be_list) {
		nfs4_put_deviceid_node(be->be_device);
		kfree(be);
	}
}
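
/*
 * Insert @new into @root, coalescing it with neighbouring extents when
 * @merge_ok is set.  @new must not overlap anything already in the tree;
 * an overlap here is a bug (hence the BUG() below).  If @new is absorbed
 * by an existing extent, its device reference is dropped and it is freed.
 */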
static void
__ext_tree_insert(struct rb_root *root,
		struct pnfs_block_extent *new, bool merge_ok)
{
	struct rb_node **p = &root->rb_node, *parent = NULL;
	struct pnfs_block_extent *be;

	while (*p) {
		parent = *p;
		be = ext_node(parent);

		if (new->be_f_offset < be->be_f_offset) {
			if (merge_ok && ext_can_merge(new, be)) {
				be->be_f_offset = new->be_f_offset;
				if (be->be_state != PNFS_BLOCK_NONE_DATA)
					be->be_v_offset = new->be_v_offset;
				be->be_length += new->be_length;
				be = ext_try_to_merge_left(root, be);
				goto free_new;
			}
			p = &(*p)->rb_left;
		} else if (new->be_f_offset >= ext_f_end(be)) {
			if (merge_ok && ext_can_merge(be, new)) {
				be->be_length += new->be_length;
				be = ext_try_to_merge_right(root, be);
				goto free_new;
			}
			p = &(*p)->rb_right;
		} else {
			BUG();
		}
	}

	rb_link_node(&new->be_node, parent, p);
	rb_insert_color(&new->be_node, root);
	return;
free_new:
	nfs4_put_deviceid_node(new->be_device);
	kfree(new);
}
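
/*
 * Remove [start, end) from @root.  Extents straddling a boundary are
 * trimmed (or split in two if both boundaries fall inside one extent),
 * while fully covered extents are unlinked onto @tmp so that the caller
 * can drop their device references outside bl_ext_lock.
 */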
static int
__ext_tree_remove(struct rb_root *root,
		sector_t start, sector_t end, struct list_head *tmp)
{
	struct pnfs_block_extent *be;
	sector_t len1 = 0, len2 = 0;
	sector_t orig_v_offset;
	sector_t orig_len;

	be = __ext_tree_search(root, start);
	if (!be)
		return 0;
	if (be->be_f_offset >= end)
		return 0;

	orig_v_offset = be->be_v_offset;
	orig_len = be->be_length;

	if (start > be->be_f_offset)
		len1 = start - be->be_f_offset;
	if (ext_f_end(be) > end)
		len2 = ext_f_end(be) - end;

	if (len2 > 0) {
		if (len1 > 0) {
			struct pnfs_block_extent *new;

			new = kzalloc(sizeof(*new), GFP_ATOMIC);
			if (!new)
				return -ENOMEM;

			be->be_length = len1;

			new->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
				new->be_v_offset =
					orig_v_offset + orig_len - len2;
			}
			new->be_length = len2;
			new->be_state = be->be_state;
			new->be_tag = be->be_tag;
			new->be_device = nfs4_get_deviceid(be->be_device);

			__ext_tree_insert(root, new, true);
		} else {
			be->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
				be->be_v_offset =
					orig_v_offset + orig_len - len2;
			}
			be->be_length = len2;
		}
	} else {
		if (len1 > 0) {
			be->be_length = len1;
			be = ext_tree_next(be);
		}

		while (be && ext_f_end(be) <= end) {
			struct pnfs_block_extent *next = ext_tree_next(be);

			rb_erase(&be->be_node, root);
			list_add_tail(&be->be_list, tmp);
			be = next;
		}

		if (be && be->be_f_offset < end) {
			len1 = ext_f_end(be) - end;
			be->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA)
				be->be_v_offset += be->be_length - len1;
			be->be_length = len1;
		}
	}

	return 0;
}
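
/*
 * Insert a freshly decoded layout extent into the read-write or read-only
 * tree according to its state.  Extents already in the tree take
 * precedence: overlapping parts of @new are discarded, and only the
 * uncovered pieces are inserted, splitting @new where necessary.
 */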
int
ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
{
	struct pnfs_block_extent *be;
	struct rb_root *root;
	int err = 0;

	switch (new->be_state) {
	case PNFS_BLOCK_READWRITE_DATA:
	case PNFS_BLOCK_INVALID_DATA:
		root = &bl->bl_ext_rw;
		break;
	case PNFS_BLOCK_READ_DATA:
	case PNFS_BLOCK_NONE_DATA:
		root = &bl->bl_ext_ro;
		break;
	default:
		dprintk("invalid extent type\n");
		return -EINVAL;
	}

	spin_lock(&bl->bl_ext_lock);
retry:
	be = __ext_tree_search(root, new->be_f_offset);
	if (!be || be->be_f_offset >= ext_f_end(new)) {
		__ext_tree_insert(root, new, true);
	} else if (new->be_f_offset >= be->be_f_offset) {
		if (ext_f_end(new) <= ext_f_end(be)) {
			nfs4_put_deviceid_node(new->be_device);
			kfree(new);
		} else {
			sector_t new_len = ext_f_end(new) - ext_f_end(be);
			sector_t diff = new->be_length - new_len;

			new->be_f_offset += diff;
			new->be_v_offset += diff;
			new->be_length = new_len;
			goto retry;
		}
	} else if (ext_f_end(new) <= ext_f_end(be)) {
		new->be_length = be->be_f_offset - new->be_f_offset;
		__ext_tree_insert(root, new, true);
	} else {
		struct pnfs_block_extent *split;
		sector_t new_len = ext_f_end(new) - ext_f_end(be);
		sector_t diff = new->be_length - new_len;

		split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
		if (!split) {
			err = -ENOMEM;
			goto out;
		}

		split->be_length = be->be_f_offset - split->be_f_offset;
		split->be_device = nfs4_get_deviceid(new->be_device);
		__ext_tree_insert(root, split, true);

		new->be_f_offset += diff;
		new->be_v_offset += diff;
		new->be_length = new_len;
		goto retry;
	}
out:
	spin_unlock(&bl->bl_ext_lock);
	return err;
}
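
/*
 * Find the extent covering @isect and copy it to @ret, so that the result
 * stays valid after bl_ext_lock has been dropped.
 */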
static bool
__ext_tree_lookup(struct rb_root *root, sector_t isect,
		struct pnfs_block_extent *ret)
{
	struct rb_node *node;
	struct pnfs_block_extent *be;

	node = root->rb_node;
	while (node) {
		be = ext_node(node);
		if (isect < be->be_f_offset)
			node = node->rb_left;
		else if (isect >= ext_f_end(be))
			node = node->rb_right;
		else {
			*ret = *be;
			return true;
		}
	}

	return false;
}

bool
ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
		struct pnfs_block_extent *ret, bool rw)
{
	bool found = false;

	spin_lock(&bl->bl_ext_lock);
	if (!rw)
		found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
	if (!found)
		found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
	spin_unlock(&bl->bl_ext_lock);

	return found;
}
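
/*
 * Remove [start, end) from the read-only tree, and from the read-write
 * tree as well if @rw is set.  Device references of dropped extents are
 * put after the lock has been released.
 */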
int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
		sector_t start, sector_t end)
{
	int err, err2;
	LIST_HEAD(tmp);

	spin_lock(&bl->bl_ext_lock);
	err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
	if (rw) {
		err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
		if (!err)
			err = err2;
	}
	spin_unlock(&bl->bl_ext_lock);

	__ext_put_deviceids(&tmp);
	return err;
}
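
/*
 * Split @be at sector @split and insert a new extent for the upper half.
 * Merging is disabled for the insert: the two halves are still identical
 * at this point and would simply coalesce again, and the caller
 * (ext_tree_mark_written) is about to retag one of them.
 */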
static int
ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
		sector_t split)
{
	struct pnfs_block_extent *new;
	sector_t orig_len = be->be_length;

	new = kzalloc(sizeof(*new), GFP_ATOMIC);
	if (!new)
		return -ENOMEM;

	be->be_length = split - be->be_f_offset;

	new->be_f_offset = split;
	if (be->be_state != PNFS_BLOCK_NONE_DATA)
		new->be_v_offset = be->be_v_offset + be->be_length;
	new->be_length = orig_len - be->be_length;
	new->be_state = be->be_state;
	new->be_tag = be->be_tag;
	new->be_device = nfs4_get_deviceid(be->be_device);

	__ext_tree_insert(root, new, false);
	return 0;
}
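
/*
 * Mark [start, start + len) as written: first drop any read-only (COW or
 * hole) extents covering the range, then tag every untagged
 * PNFS_BLOCK_INVALID_DATA extent inside it as EXTENT_WRITTEN, splitting
 * or shifting extents at the boundaries so that only the written part
 * changes state.  @lwb updates the last-written-byte hint used by the
 * next layoutcommit.
 */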
int
ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
		sector_t len, u64 lwb)
{
	struct rb_root *root = &bl->bl_ext_rw;
	sector_t end = start + len;
	struct pnfs_block_extent *be;
	int err = 0;
	LIST_HEAD(tmp);

	spin_lock(&bl->bl_ext_lock);
	/*
	 * First remove all COW extents or holes from the written-to range.
	 */
	err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
	if (err)
		goto out;

	/*
	 * Then mark all invalid extents in the range as written to.
	 */
	for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
		if (be->be_f_offset >= end)
			break;

		if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
			continue;

		if (be->be_f_offset < start) {
			struct pnfs_block_extent *left = ext_tree_prev(be);

			if (left && ext_can_merge(left, be)) {
				sector_t diff = start - be->be_f_offset;

				left->be_length += diff;

				be->be_f_offset += diff;
				be->be_v_offset += diff;
				be->be_length -= diff;
			} else {
				err = ext_tree_split(root, be, start);
				if (err)
					goto out;
			}
		}

		if (ext_f_end(be) > end) {
			struct pnfs_block_extent *right = ext_tree_next(be);

			if (right && ext_can_merge(be, right)) {
				sector_t diff = ext_f_end(be) - end;

				be->be_length -= diff;

				right->be_f_offset -= diff;
				right->be_v_offset -= diff;
				right->be_length += diff;
			} else {
				err = ext_tree_split(root, be, end);
				if (err)
					goto out;
			}
		}

		if (be->be_f_offset >= start && ext_f_end(be) <= end) {
			be->be_tag = EXTENT_WRITTEN;
			be = ext_try_to_merge_left(root, be);
			be = ext_try_to_merge_right(root, be);
		}
	}
out:
	if (bl->bl_lwb < lwb)
		bl->bl_lwb = lwb;
	spin_unlock(&bl->bl_ext_lock);

	__ext_put_deviceids(&tmp);
	return err;
}
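
/*
 * Size of the layoutupdate payload: one __be32 extent count followed by
 * @count encoded extents (bare byte ranges for SCSI layouts, full block
 * extents otherwise).
 */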
static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
{
	if (bl->bl_scsi_layout)
		return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
	else
		return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
}

static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
		size_t buffer_size)
{
	if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
		int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;

		for (i = 0; i < nr_pages; i++)
			put_page(arg->layoutupdate_pages[i]);
		vfree(arg->start_p);
		kfree(arg->layoutupdate_pages);
	} else {
		put_page(arg->layoutupdate_page);
	}
}
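
/*
 * Encode one written extent for LAYOUTCOMMIT.  Block layouts send the
 * device ID, byte offset, byte length, a zero storage offset and the
 * target PNFS_BLOCK_READWRITE_DATA state; SCSI layouts only need the
 * byte range.  Sector values are converted to bytes here.
 */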
static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
{
	p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
			NFS4_DEVICEID4_SIZE);
	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
	p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
	p = xdr_encode_hyper(p, 0LL);
	*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
	return p;
}

static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
{
	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
	return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
}
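
/*
 * Encode every extent tagged EXTENT_WRITTEN in the read-write tree into
 * the layoutupdate buffer, retagging them EXTENT_COMMITTING.  If the
 * buffer is too small, keep counting and return -ENOSPC so that the
 * caller can retry with a buffer sized via ext_tree_layoutupdate_size().
 */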
static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
		size_t buffer_size, size_t *count, __u64 *lastbyte)
{
	struct pnfs_block_extent *be;
	int ret = 0;

	spin_lock(&bl->bl_ext_lock);
	for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
		    be->be_tag != EXTENT_WRITTEN)
			continue;

		(*count)++;
		if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
			/* keep counting.. */
			ret = -ENOSPC;
			continue;
		}

		if (bl->bl_scsi_layout)
			p = encode_scsi_range(be, p);
		else
			p = encode_block_extent(be, p);
		be->be_tag = EXTENT_COMMITTING;
	}
	*lastbyte = bl->bl_lwb - 1;
	bl->bl_lwb = 0;
	spin_unlock(&bl->bl_ext_lock);

	return ret;
}
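
/*
 * Build the layoutupdate blob for a layoutcommit.  Optimistically start
 * with a single page; if the encoded extents do not fit, retry with a
 * vmalloc()ed buffer of the exact size and hand its pages to the RPC
 * layer via vmalloc_to_page().
 */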
int
ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
	size_t count = 0, buffer_size = PAGE_SIZE;
	__be32 *start_p;
	int ret;

	dprintk("%s enter\n", __func__);

	arg->layoutupdate_page = alloc_page(GFP_NOFS);
	if (!arg->layoutupdate_page)
		return -ENOMEM;
	start_p = page_address(arg->layoutupdate_page);
	arg->layoutupdate_pages = &arg->layoutupdate_page;

retry:
	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
	if (unlikely(ret)) {
		ext_tree_free_commitdata(arg, buffer_size);

		buffer_size = ext_tree_layoutupdate_size(bl, count);
		count = 0;

		arg->layoutupdate_pages =
			kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
				sizeof(struct page *), GFP_NOFS);
		if (!arg->layoutupdate_pages)
			return -ENOMEM;

		start_p = __vmalloc(buffer_size, GFP_NOFS);
		if (!start_p) {
			kfree(arg->layoutupdate_pages);
			return -ENOMEM;
		}

		goto retry;
	}

	*start_p = cpu_to_be32(count);
	arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);

	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
		void *p = start_p, *end = p + arg->layoutupdate_len;
		struct page *page = NULL;
		int i = 0;

		arg->start_p = start_p;
		for ( ; p < end; p += PAGE_SIZE) {
			page = vmalloc_to_page(p);
			arg->layoutupdate_pages[i++] = page;
			get_page(page);
		}
	}

	dprintk("%s found %zu ranges\n", __func__, count);
	return 0;
}
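
/*
 * Process the result of a layoutcommit: on success, extents tagged
 * EXTENT_COMMITTING become plain read-write data; on failure they are
 * set back to EXTENT_WRITTEN so that the next layoutcommit retries them.
 */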
void
ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
	struct rb_root *root = &bl->bl_ext_rw;
	struct pnfs_block_extent *be;

	dprintk("%s status %d\n", __func__, status);

	ext_tree_free_commitdata(arg, arg->layoutupdate_len);

	spin_lock(&bl->bl_ext_lock);
	for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
		    be->be_tag != EXTENT_COMMITTING)
			continue;

		if (status) {
			/*
			 * Mark as written and try again.
			 *
			 * XXX: some real error handling here wouldn't hurt..
			 */
			be->be_tag = EXTENT_WRITTEN;
		} else {
			be->be_state = PNFS_BLOCK_READWRITE_DATA;
			be->be_tag = 0;
		}

		be = ext_try_to_merge_left(root, be);
		be = ext_try_to_merge_right(root, be);
	}
	spin_unlock(&bl->bl_ext_lock);
}