dev-replace.c 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) STRATO AG 2012. All rights reserved.
  4. */
  5. #include <linux/sched.h>
  6. #include <linux/bio.h>
  7. #include <linux/slab.h>
  8. #include <linux/blkdev.h>
  9. #include <linux/kthread.h>
  10. #include <linux/math64.h>
  11. #include "misc.h"
  12. #include "ctree.h"
  13. #include "extent_map.h"
  14. #include "disk-io.h"
  15. #include "transaction.h"
  16. #include "print-tree.h"
  17. #include "volumes.h"
  18. #include "async-thread.h"
  19. #include "check-integrity.h"
  20. #include "rcu-string.h"
  21. #include "dev-replace.h"
  22. #include "sysfs.h"
  23. #include "zoned.h"
  24. #include "block-group.h"
  25. /*
  26. * Device replace overview
  27. *
  28. * [Objective]
  29. * To copy all extents (both new and on-disk) from source device to target
  30. * device, while still keeping the filesystem read-write.
  31. *
  32. * [Method]
  33. * There are two main methods involved:
  34. *
  35. * - Write duplication
  36. *
  37. * All new writes will be written to both target and source devices, so even
  38. * if replace gets canceled, the source device still contains up-to-date data.
  39. *
  40. * Location: handle_ops_on_dev_replace() from __btrfs_map_block()
  41. * Start: btrfs_dev_replace_start()
  42. * End: btrfs_dev_replace_finishing()
  43. * Content: Latest data/metadata
  44. *
  45. * - Copy existing extents
  46. *
  47. * This happens by re-using scrub facility, as scrub also iterates through
  48. * existing extents from commit root.
  49. *
  50. * Location: scrub_write_block_to_dev_replace() from
  51. * scrub_block_complete()
  52. * Content: Data/meta from commit root.
  53. *
  54. * Due to the content difference, we need to avoid nocow write when dev-replace
  55. * is happening. This is done by marking the block group read-only and waiting
  56. * for NOCOW writes.
  57. *
  58. * After replace is done, the finishing part is done by swapping the target and
  59. * source devices.
  60. *
  61. * Location: btrfs_dev_replace_update_device_in_mapping_tree() from
  62. * btrfs_dev_replace_finishing()
  63. */
  64. static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
  65. int scrub_ret);
  66. static int btrfs_dev_replace_kthread(void *data);
  67. int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
  68. {
  69. struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID };
  70. struct btrfs_key key;
  71. struct btrfs_root *dev_root = fs_info->dev_root;
  72. struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
  73. struct extent_buffer *eb;
  74. int slot;
  75. int ret = 0;
  76. struct btrfs_path *path = NULL;
  77. int item_size;
  78. struct btrfs_dev_replace_item *ptr;
  79. u64 src_devid;
  80. if (!dev_root)
  81. return 0;
  82. path = btrfs_alloc_path();
  83. if (!path) {
  84. ret = -ENOMEM;
  85. goto out;
  86. }
  87. key.objectid = 0;
  88. key.type = BTRFS_DEV_REPLACE_KEY;
  89. key.offset = 0;
  90. ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
  91. if (ret) {
  92. no_valid_dev_replace_entry_found:
  93. /*
  94. * We don't have a replace item or it's corrupted. If there is
  95. * a replace target, fail the mount.
  96. */
  97. if (btrfs_find_device(fs_info->fs_devices, &args)) {
  98. btrfs_err(fs_info,
  99. "found replace target device without a valid replace item");
  100. ret = -EUCLEAN;
  101. goto out;
  102. }
  103. ret = 0;
  104. dev_replace->replace_state =
  105. BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
  106. dev_replace->cont_reading_from_srcdev_mode =
  107. BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
  108. dev_replace->time_started = 0;
  109. dev_replace->time_stopped = 0;
  110. atomic64_set(&dev_replace->num_write_errors, 0);
  111. atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
  112. dev_replace->cursor_left = 0;
  113. dev_replace->committed_cursor_left = 0;
  114. dev_replace->cursor_left_last_write_of_item = 0;
  115. dev_replace->cursor_right = 0;
  116. dev_replace->srcdev = NULL;
  117. dev_replace->tgtdev = NULL;
  118. dev_replace->is_valid = 0;
  119. dev_replace->item_needs_writeback = 0;
  120. goto out;
  121. }
  122. slot = path->slots[0];
  123. eb = path->nodes[0];
  124. item_size = btrfs_item_size(eb, slot);
  125. ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
  126. if (item_size != sizeof(struct btrfs_dev_replace_item)) {
  127. btrfs_warn(fs_info,
  128. "dev_replace entry found has unexpected size, ignore entry");
  129. goto no_valid_dev_replace_entry_found;
  130. }
  131. src_devid = btrfs_dev_replace_src_devid(eb, ptr);
  132. dev_replace->cont_reading_from_srcdev_mode =
  133. btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
  134. dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
  135. dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
  136. dev_replace->time_stopped =
  137. btrfs_dev_replace_time_stopped(eb, ptr);
  138. atomic64_set(&dev_replace->num_write_errors,
  139. btrfs_dev_replace_num_write_errors(eb, ptr));
  140. atomic64_set(&dev_replace->num_uncorrectable_read_errors,
  141. btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
  142. dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
  143. dev_replace->committed_cursor_left = dev_replace->cursor_left;
  144. dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
  145. dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
  146. dev_replace->is_valid = 1;
  147. dev_replace->item_needs_writeback = 0;
  148. switch (dev_replace->replace_state) {
  149. case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
  150. case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
  151. case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
  152. /*
  153. * We don't have an active replace item but if there is a
  154. * replace target, fail the mount.
  155. */
  156. if (btrfs_find_device(fs_info->fs_devices, &args)) {
  157. btrfs_err(fs_info,
  158. "replace without active item, run 'device scan --forget' on the target device");
  159. ret = -EUCLEAN;
  160. } else {
  161. dev_replace->srcdev = NULL;
  162. dev_replace->tgtdev = NULL;
  163. }
  164. break;
  165. case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
  166. case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
  167. dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args);
  168. args.devid = src_devid;
  169. dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args);
  170. /*
  171. * allow 'btrfs dev replace_cancel' if src/tgt device is
  172. * missing
  173. */
  174. if (!dev_replace->srcdev &&
  175. !btrfs_test_opt(fs_info, DEGRADED)) {
  176. ret = -EIO;
  177. btrfs_warn(fs_info,
  178. "cannot mount because device replace operation is ongoing and");
  179. btrfs_warn(fs_info,
  180. "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
  181. src_devid);
  182. }
  183. if (!dev_replace->tgtdev &&
  184. !btrfs_test_opt(fs_info, DEGRADED)) {
  185. ret = -EIO;
  186. btrfs_warn(fs_info,
  187. "cannot mount because device replace operation is ongoing and");
  188. btrfs_warn(fs_info,
  189. "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
  190. BTRFS_DEV_REPLACE_DEVID);
  191. }
  192. if (dev_replace->tgtdev) {
  193. if (dev_replace->srcdev) {
  194. dev_replace->tgtdev->total_bytes =
  195. dev_replace->srcdev->total_bytes;
  196. dev_replace->tgtdev->disk_total_bytes =
  197. dev_replace->srcdev->disk_total_bytes;
  198. dev_replace->tgtdev->commit_total_bytes =
  199. dev_replace->srcdev->commit_total_bytes;
  200. dev_replace->tgtdev->bytes_used =
  201. dev_replace->srcdev->bytes_used;
  202. dev_replace->tgtdev->commit_bytes_used =
  203. dev_replace->srcdev->commit_bytes_used;
  204. }
  205. set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
  206. &dev_replace->tgtdev->dev_state);
  207. WARN_ON(fs_info->fs_devices->rw_devices == 0);
  208. dev_replace->tgtdev->io_width = fs_info->sectorsize;
  209. dev_replace->tgtdev->io_align = fs_info->sectorsize;
  210. dev_replace->tgtdev->sector_size = fs_info->sectorsize;
  211. dev_replace->tgtdev->fs_info = fs_info;
  212. set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
  213. &dev_replace->tgtdev->dev_state);
  214. }
  215. break;
  216. }
  217. out:
  218. btrfs_free_path(path);
  219. return ret;
  220. }
  221. /*
  222. * Initialize a new device for device replace target from a given source dev
  223. * and path.
  224. *
  225. * Return 0 and new device in @device_out, otherwise return < 0
  226. */
  227. static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
  228. const char *device_path,
  229. struct btrfs_device *srcdev,
  230. struct btrfs_device **device_out)
  231. {
  232. struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
  233. struct btrfs_device *device;
  234. struct block_device *bdev;
  235. struct rcu_string *name;
  236. u64 devid = BTRFS_DEV_REPLACE_DEVID;
  237. int ret = 0;
  238. *device_out = NULL;
  239. if (srcdev->fs_devices->seeding) {
  240. btrfs_err(fs_info, "the filesystem is a seed filesystem!");
  241. return -EINVAL;
  242. }
  243. bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
  244. fs_info->bdev_holder);
  245. if (IS_ERR(bdev)) {
  246. btrfs_err(fs_info, "target device %s is invalid!", device_path);
  247. return PTR_ERR(bdev);
  248. }
  249. if (!btrfs_check_device_zone_type(fs_info, bdev)) {
  250. btrfs_err(fs_info,
  251. "dev-replace: zoned type of target device mismatch with filesystem");
  252. ret = -EINVAL;
  253. goto error;
  254. }
  255. sync_blockdev(bdev);
  256. list_for_each_entry(device, &fs_devices->devices, dev_list) {
  257. if (device->bdev == bdev) {
  258. btrfs_err(fs_info,
  259. "target device is in the filesystem!");
  260. ret = -EEXIST;
  261. goto error;
  262. }
  263. }
  264. if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) {
  265. btrfs_err(fs_info,
  266. "target device is smaller than source device!");
  267. ret = -EINVAL;
  268. goto error;
  269. }
  270. device = btrfs_alloc_device(NULL, &devid, NULL);
  271. if (IS_ERR(device)) {
  272. ret = PTR_ERR(device);
  273. goto error;
  274. }
  275. name = rcu_string_strdup(device_path, GFP_KERNEL);
  276. if (!name) {
  277. btrfs_free_device(device);
  278. ret = -ENOMEM;
  279. goto error;
  280. }
  281. rcu_assign_pointer(device->name, name);
  282. ret = lookup_bdev(device_path, &device->devt);
  283. if (ret)
  284. goto error;
  285. set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
  286. device->generation = 0;
  287. device->io_width = fs_info->sectorsize;
  288. device->io_align = fs_info->sectorsize;
  289. device->sector_size = fs_info->sectorsize;
  290. device->total_bytes = btrfs_device_get_total_bytes(srcdev);
  291. device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
  292. device->bytes_used = btrfs_device_get_bytes_used(srcdev);
  293. device->commit_total_bytes = srcdev->commit_total_bytes;
  294. device->commit_bytes_used = device->bytes_used;
  295. device->fs_info = fs_info;
  296. device->bdev = bdev;
  297. set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
  298. set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
  299. device->mode = FMODE_EXCL;
  300. device->dev_stats_valid = 1;
  301. set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
  302. device->fs_devices = fs_devices;
  303. ret = btrfs_get_dev_zone_info(device, false);
  304. if (ret)
  305. goto error;
  306. mutex_lock(&fs_devices->device_list_mutex);
  307. list_add(&device->dev_list, &fs_devices->devices);
  308. fs_devices->num_devices++;
  309. fs_devices->open_devices++;
  310. mutex_unlock(&fs_devices->device_list_mutex);
  311. *device_out = device;
  312. return 0;
  313. error:
  314. blkdev_put(bdev, FMODE_EXCL);
  315. return ret;
  316. }
  317. /*
  318. * called from commit_transaction. Writes changed device replace state to
  319. * disk.
  320. */
  321. int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
  322. {
  323. struct btrfs_fs_info *fs_info = trans->fs_info;
  324. int ret;
  325. struct btrfs_root *dev_root = fs_info->dev_root;
  326. struct btrfs_path *path;
  327. struct btrfs_key key;
  328. struct extent_buffer *eb;
  329. struct btrfs_dev_replace_item *ptr;
  330. struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
  331. down_read(&dev_replace->rwsem);
  332. if (!dev_replace->is_valid ||
  333. !dev_replace->item_needs_writeback) {
  334. up_read(&dev_replace->rwsem);
  335. return 0;
  336. }
  337. up_read(&dev_replace->rwsem);
  338. key.objectid = 0;
  339. key.type = BTRFS_DEV_REPLACE_KEY;
  340. key.offset = 0;
  341. path = btrfs_alloc_path();
  342. if (!path) {
  343. ret = -ENOMEM;
  344. goto out;
  345. }
  346. ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
  347. if (ret < 0) {
  348. btrfs_warn(fs_info,
  349. "error %d while searching for dev_replace item!",
  350. ret);
  351. goto out;
  352. }
  353. if (ret == 0 &&
  354. btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
  355. /*
  356. * need to delete old one and insert a new one.
  357. * Since no attempt is made to recover any old state, if the
  358. * dev_replace state is 'running', the data on the target
  359. * drive is lost.
  360. * It would be possible to recover the state: just make sure
  361. * that the beginning of the item is never changed and always
  362. * contains all the essential information. Then read this
  363. * minimal set of information and use it as a base for the
  364. * new state.
  365. */
  366. ret = btrfs_del_item(trans, dev_root, path);
  367. if (ret != 0) {
  368. btrfs_warn(fs_info,
  369. "delete too small dev_replace item failed %d!",
  370. ret);
  371. goto out;
  372. }
  373. ret = 1;
  374. }
  375. if (ret == 1) {
  376. /* need to insert a new item */
  377. btrfs_release_path(path);
  378. ret = btrfs_insert_empty_item(trans, dev_root, path,
  379. &key, sizeof(*ptr));
  380. if (ret < 0) {
  381. btrfs_warn(fs_info,
  382. "insert dev_replace item failed %d!", ret);
  383. goto out;
  384. }
  385. }
  386. eb = path->nodes[0];
  387. ptr = btrfs_item_ptr(eb, path->slots[0],
  388. struct btrfs_dev_replace_item);
  389. down_write(&dev_replace->rwsem);
  390. if (dev_replace->srcdev)
  391. btrfs_set_dev_replace_src_devid(eb, ptr,
  392. dev_replace->srcdev->devid);
  393. else
  394. btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
  395. btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
  396. dev_replace->cont_reading_from_srcdev_mode);
  397. btrfs_set_dev_replace_replace_state(eb, ptr,
  398. dev_replace->replace_state);
  399. btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
  400. btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
  401. btrfs_set_dev_replace_num_write_errors(eb, ptr,
  402. atomic64_read(&dev_replace->num_write_errors));
  403. btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
  404. atomic64_read(&dev_replace->num_uncorrectable_read_errors));
  405. dev_replace->cursor_left_last_write_of_item =
  406. dev_replace->cursor_left;
  407. btrfs_set_dev_replace_cursor_left(eb, ptr,
  408. dev_replace->cursor_left_last_write_of_item);
  409. btrfs_set_dev_replace_cursor_right(eb, ptr,
  410. dev_replace->cursor_right);
  411. dev_replace->item_needs_writeback = 0;
  412. up_write(&dev_replace->rwsem);
  413. btrfs_mark_buffer_dirty(eb);
  414. out:
  415. btrfs_free_path(path);
  416. return ret;
  417. }
  418. static char* btrfs_dev_name(struct btrfs_device *device)
  419. {
  420. if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
  421. return "<missing disk>";
  422. else
  423. return rcu_str_deref(device->name);
  424. }
  425. static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
  426. struct btrfs_device *src_dev)
  427. {
  428. struct btrfs_path *path;
  429. struct btrfs_key key;
  430. struct btrfs_key found_key;
  431. struct btrfs_root *root = fs_info->dev_root;
  432. struct btrfs_dev_extent *dev_extent = NULL;
  433. struct btrfs_block_group *cache;
  434. struct btrfs_trans_handle *trans;
  435. int iter_ret = 0;
  436. int ret = 0;
  437. u64 chunk_offset;
  438. /* Do not use "to_copy" on non zoned filesystem for now */
  439. if (!btrfs_is_zoned(fs_info))
  440. return 0;
  441. mutex_lock(&fs_info->chunk_mutex);
  442. /* Ensure we don't have pending new block group */
  443. spin_lock(&fs_info->trans_lock);
  444. while (fs_info->running_transaction &&
  445. !list_empty(&fs_info->running_transaction->dev_update_list)) {
  446. spin_unlock(&fs_info->trans_lock);
  447. mutex_unlock(&fs_info->chunk_mutex);
  448. trans = btrfs_attach_transaction(root);
  449. if (IS_ERR(trans)) {
  450. ret = PTR_ERR(trans);
  451. mutex_lock(&fs_info->chunk_mutex);
  452. if (ret == -ENOENT) {
  453. spin_lock(&fs_info->trans_lock);
  454. continue;
  455. } else {
  456. goto unlock;
  457. }
  458. }
  459. ret = btrfs_commit_transaction(trans);
  460. mutex_lock(&fs_info->chunk_mutex);
  461. if (ret)
  462. goto unlock;
  463. spin_lock(&fs_info->trans_lock);
  464. }
  465. spin_unlock(&fs_info->trans_lock);
  466. path = btrfs_alloc_path();
  467. if (!path) {
  468. ret = -ENOMEM;
  469. goto unlock;
  470. }
  471. path->reada = READA_FORWARD;
  472. path->search_commit_root = 1;
  473. path->skip_locking = 1;
  474. key.objectid = src_dev->devid;
  475. key.type = BTRFS_DEV_EXTENT_KEY;
  476. key.offset = 0;
  477. btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
  478. struct extent_buffer *leaf = path->nodes[0];
  479. if (found_key.objectid != src_dev->devid)
  480. break;
  481. if (found_key.type != BTRFS_DEV_EXTENT_KEY)
  482. break;
  483. if (found_key.offset < key.offset)
  484. break;
  485. dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
  486. chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
  487. cache = btrfs_lookup_block_group(fs_info, chunk_offset);
  488. if (!cache)
  489. continue;
  490. set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
  491. btrfs_put_block_group(cache);
  492. }
  493. if (iter_ret < 0)
  494. ret = iter_ret;
  495. btrfs_free_path(path);
  496. unlock:
  497. mutex_unlock(&fs_info->chunk_mutex);
  498. return ret;
  499. }
  500. bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
  501. struct btrfs_block_group *cache,
  502. u64 physical)
  503. {
  504. struct btrfs_fs_info *fs_info = cache->fs_info;
  505. struct extent_map *em;
  506. struct map_lookup *map;
  507. u64 chunk_offset = cache->start;
  508. int num_extents, cur_extent;
  509. int i;
  510. /* Do not use "to_copy" on non zoned filesystem for now */
  511. if (!btrfs_is_zoned(fs_info))
  512. return true;
  513. spin_lock(&cache->lock);
  514. if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
  515. spin_unlock(&cache->lock);
  516. return true;
  517. }
  518. spin_unlock(&cache->lock);
  519. em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
  520. ASSERT(!IS_ERR(em));
  521. map = em->map_lookup;
  522. num_extents = 0;
  523. cur_extent = 0;
  524. for (i = 0; i < map->num_stripes; i++) {
  525. /* We have more device extent to copy */
  526. if (srcdev != map->stripes[i].dev)
  527. continue;
  528. num_extents++;
  529. if (physical == map->stripes[i].physical)
  530. cur_extent = i;
  531. }
  532. free_extent_map(em);
  533. if (num_extents > 1 && cur_extent < num_extents - 1) {
  534. /*
  535. * Has more stripes on this device. Keep this block group
  536. * readonly until we finish all the stripes.
  537. */
  538. return false;
  539. }
  540. /* Last stripe on this device */
  541. clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
  542. return true;
  543. }
  544. static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
  545. const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
  546. int read_src)
  547. {
  548. struct btrfs_root *root = fs_info->dev_root;
  549. struct btrfs_trans_handle *trans;
  550. struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
  551. int ret;
  552. struct btrfs_device *tgt_device = NULL;
  553. struct btrfs_device *src_device = NULL;
  554. src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
  555. srcdev_name);
  556. if (IS_ERR(src_device))
  557. return PTR_ERR(src_device);
  558. if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
  559. btrfs_warn_in_rcu(fs_info,
  560. "cannot replace device %s (devid %llu) due to active swapfile",
  561. btrfs_dev_name(src_device), src_device->devid);
  562. return -ETXTBSY;
  563. }
  564. /*
  565. * Here we commit the transaction to make sure commit_total_bytes
  566. * of all the devices are updated.
  567. */
  568. trans = btrfs_attach_transaction(root);
  569. if (!IS_ERR(trans)) {
  570. ret = btrfs_commit_transaction(trans);
  571. if (ret)
  572. return ret;
  573. } else if (PTR_ERR(trans) != -ENOENT) {
  574. return PTR_ERR(trans);
  575. }
  576. ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
  577. src_device, &tgt_device);
  578. if (ret)
  579. return ret;
  580. ret = mark_block_group_to_copy(fs_info, src_device);
  581. if (ret)
  582. return ret;
  583. down_write(&dev_replace->rwsem);
  584. switch (dev_replace->replace_state) {
  585. case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
  586. case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
  587. case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
  588. break;
  589. case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
  590. case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
  591. ASSERT(0);
  592. ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
  593. up_write(&dev_replace->rwsem);
  594. goto leave;
  595. }
  596. dev_replace->cont_reading_from_srcdev_mode = read_src;
  597. dev_replace->srcdev = src_device;
  598. dev_replace->tgtdev = tgt_device;
  599. btrfs_info_in_rcu(fs_info,
  600. "dev_replace from %s (devid %llu) to %s started",
  601. btrfs_dev_name(src_device),
  602. src_device->devid,
  603. rcu_str_deref(tgt_device->name));
  604. /*
  605. * from now on, the writes to the srcdev are all duplicated to
  606. * go to the tgtdev as well (refer to btrfs_map_block()).
  607. */
  608. dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
  609. dev_replace->time_started = ktime_get_real_seconds();
  610. dev_replace->cursor_left = 0;
  611. dev_replace->committed_cursor_left = 0;
  612. dev_replace->cursor_left_last_write_of_item = 0;
  613. dev_replace->cursor_right = 0;
  614. dev_replace->is_valid = 1;
  615. dev_replace->item_needs_writeback = 1;
  616. atomic64_set(&dev_replace->num_write_errors, 0);
  617. atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
  618. up_write(&dev_replace->rwsem);
  619. ret = btrfs_sysfs_add_device(tgt_device);
  620. if (ret)
  621. btrfs_err(fs_info, "kobj add dev failed %d", ret);
  622. btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
  623. /*
  624. * Commit dev_replace state and reserve 1 item for it.
  625. * This is crucial to ensure we won't miss copying extents for new block
  626. * groups that are allocated after we started the device replace, and
  627. * must be done after setting up the device replace state.
  628. */
  629. trans = btrfs_start_transaction(root, 1);
  630. if (IS_ERR(trans)) {
  631. ret = PTR_ERR(trans);
  632. down_write(&dev_replace->rwsem);
  633. dev_replace->replace_state =
  634. BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
  635. dev_replace->srcdev = NULL;
  636. dev_replace->tgtdev = NULL;
  637. up_write(&dev_replace->rwsem);
  638. goto leave;
  639. }
  640. ret = btrfs_commit_transaction(trans);
  641. WARN_ON(ret);
  642. /* the disk copy procedure reuses the scrub code */
  643. ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
  644. btrfs_device_get_total_bytes(src_device),
  645. &dev_replace->scrub_progress, 0, 1);
  646. ret = btrfs_dev_replace_finishing(fs_info, ret);
  647. if (ret == -EINPROGRESS)
  648. ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
  649. return ret;
  650. leave:
  651. btrfs_destroy_dev_replace_tgtdev(tgt_device);
  652. return ret;
  653. }
  654. int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
  655. struct btrfs_ioctl_dev_replace_args *args)
  656. {
  657. int ret;
  658. switch (args->start.cont_reading_from_srcdev_mode) {
  659. case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
  660. case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
  661. break;
  662. default:
  663. return -EINVAL;
  664. }
  665. if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
  666. args->start.tgtdev_name[0] == '\0')
  667. return -EINVAL;
  668. ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
  669. args->start.srcdevid,
  670. args->start.srcdev_name,
  671. args->start.cont_reading_from_srcdev_mode);
  672. args->result = ret;
  673. /* don't warn if EINPROGRESS, someone else might be running scrub */
  674. if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
  675. ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
  676. return 0;
  677. return ret;
  678. }
  679. /*
  680. * blocked until all in-flight bios operations are finished.
  681. */
  682. static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
  683. {
  684. set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
  685. wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
  686. &fs_info->dev_replace.bio_counter));
  687. }
  688. /*
  689. * we have removed target device, it is safe to allow new bios request.
  690. */
  691. static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
  692. {
  693. clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
  694. wake_up(&fs_info->dev_replace.replace_wait);
  695. }
  696. /*
  697. * When finishing the device replace, before swapping the source device with the
  698. * target device we must update the chunk allocation state in the target device,
  699. * as it is empty because replace works by directly copying the chunks and not
  700. * through the normal chunk allocation path.
  701. */
  702. static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
  703. struct btrfs_device *tgtdev)
  704. {
  705. struct extent_state *cached_state = NULL;
  706. u64 start = 0;
  707. u64 found_start;
  708. u64 found_end;
  709. int ret = 0;
  710. lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
  711. while (!find_first_extent_bit(&srcdev->alloc_state, start,
  712. &found_start, &found_end,
  713. CHUNK_ALLOCATED, &cached_state)) {
  714. ret = set_extent_bits(&tgtdev->alloc_state, found_start,
  715. found_end, CHUNK_ALLOCATED);
  716. if (ret)
  717. break;
  718. start = found_end + 1;
  719. }
  720. free_extent_state(cached_state);
  721. return ret;
  722. }
  723. static void btrfs_dev_replace_update_device_in_mapping_tree(
  724. struct btrfs_fs_info *fs_info,
  725. struct btrfs_device *srcdev,
  726. struct btrfs_device *tgtdev)
  727. {
  728. struct extent_map_tree *em_tree = &fs_info->mapping_tree;
  729. struct extent_map *em;
  730. struct map_lookup *map;
  731. u64 start = 0;
  732. int i;
  733. write_lock(&em_tree->lock);
  734. do {
  735. em = lookup_extent_mapping(em_tree, start, (u64)-1);
  736. if (!em)
  737. break;
  738. map = em->map_lookup;
  739. for (i = 0; i < map->num_stripes; i++)
  740. if (srcdev == map->stripes[i].dev)
  741. map->stripes[i].dev = tgtdev;
  742. start = em->start + em->len;
  743. free_extent_map(em);
  744. } while (start);
  745. write_unlock(&em_tree->lock);
  746. }
  747. static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
  748. int scrub_ret)
  749. {
  750. struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
  751. struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
  752. struct btrfs_device *tgt_device;
  753. struct btrfs_device *src_device;
  754. struct btrfs_root *root = fs_info->tree_root;
  755. u8 uuid_tmp[BTRFS_UUID_SIZE];
  756. struct btrfs_trans_handle *trans;
  757. int ret = 0;
  758. /* don't allow cancel or unmount to disturb the finishing procedure */
  759. mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
  760. down_read(&dev_replace->rwsem);
  761. /* was the operation canceled, or is it finished? */
  762. if (dev_replace->replace_state !=
  763. BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
  764. up_read(&dev_replace->rwsem);
  765. mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
  766. return 0;
  767. }
  768. tgt_device = dev_replace->tgtdev;
  769. src_device = dev_replace->srcdev;
  770. up_read(&dev_replace->rwsem);
  771. /*
  772. * flush all outstanding I/O and inode extent mappings before the
  773. * copy operation is declared as being finished
  774. */
  775. ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
  776. if (ret) {
  777. mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
  778. return ret;
  779. }
  780. btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
  781. /*
  782. * We have to use this loop approach because at this point src_device
  783. * has to be available for transaction commit to complete, yet new
  784. * chunks shouldn't be allocated on the device.
  785. */
  786. while (1) {
  787. trans = btrfs_start_transaction(root, 0);
  788. if (IS_ERR(trans)) {
  789. mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
  790. return PTR_ERR(trans);
  791. }
  792. ret = btrfs_commit_transaction(trans);
  793. WARN_ON(ret);
  794. /* Prevent write_all_supers() during the finishing procedure */
  795. mutex_lock(&fs_devices->device_list_mutex);
  796. /* Prevent new chunks being allocated on the source device */
  797. mutex_lock(&fs_info->chunk_mutex);
  798. if (!list_empty(&src_device->post_commit_list)) {
  799. mutex_unlock(&fs_devices->device_list_mutex);
  800. mutex_unlock(&fs_info->chunk_mutex);
  801. } else {
  802. break;
  803. }
  804. }
  805. down_write(&dev_replace->rwsem);
  806. dev_replace->replace_state =
  807. scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
  808. : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
  809. dev_replace->tgtdev = NULL;
  810. dev_replace->srcdev = NULL;
  811. dev_replace->time_stopped = ktime_get_real_seconds();
  812. dev_replace->item_needs_writeback = 1;
  813. /*
  814. * Update allocation state in the new device and replace the old device
  815. * with the new one in the mapping tree.
  816. */
  817. if (!scrub_ret) {
  818. scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
  819. if (scrub_ret)
  820. goto error;
  821. btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
  822. src_device,
  823. tgt_device);
  824. } else {
  825. if (scrub_ret != -ECANCELED)
  826. btrfs_err_in_rcu(fs_info,
  827. "btrfs_scrub_dev(%s, %llu, %s) failed %d",
  828. btrfs_dev_name(src_device),
  829. src_device->devid,
  830. rcu_str_deref(tgt_device->name), scrub_ret);
  831. error:
  832. up_write(&dev_replace->rwsem);
  833. mutex_unlock(&fs_info->chunk_mutex);
  834. mutex_unlock(&fs_devices->device_list_mutex);
  835. btrfs_rm_dev_replace_blocked(fs_info);
  836. if (tgt_device)
  837. btrfs_destroy_dev_replace_tgtdev(tgt_device);
  838. btrfs_rm_dev_replace_unblocked(fs_info);
  839. mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
  840. return scrub_ret;
  841. }
  842. btrfs_info_in_rcu(fs_info,
  843. "dev_replace from %s (devid %llu) to %s finished",
  844. btrfs_dev_name(src_device),
  845. src_device->devid,
  846. rcu_str_deref(tgt_device->name));
  847. clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
  848. tgt_device->devid = src_device->devid;
  849. src_device->devid = BTRFS_DEV_REPLACE_DEVID;
  850. memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
  851. memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
  852. memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
  853. btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
  854. btrfs_device_set_disk_total_bytes(tgt_device,
  855. src_device->disk_total_bytes);
  856. btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
  857. tgt_device->commit_bytes_used = src_device->bytes_used;
  858. btrfs_assign_next_active_device(src_device, tgt_device);
  859. list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
  860. fs_devices->rw_devices++;
  861. up_write(&dev_replace->rwsem);
  862. btrfs_rm_dev_replace_blocked(fs_info);
  863. btrfs_rm_dev_replace_remove_srcdev(src_device);
  864. btrfs_rm_dev_replace_unblocked(fs_info);
  865. /*
  866. * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
  867. * update on-disk dev stats value during commit transaction
  868. */
  869. atomic_inc(&tgt_device->dev_stats_ccnt);
  870. /*
  871. * this is again a consistent state where no dev_replace procedure
  872. * is running, the target device is part of the filesystem, the
  873. * source device is not part of the filesystem anymore and its 1st
  874. * superblock is scratched out so that it is no longer marked to
  875. * belong to this filesystem.
  876. */
  877. mutex_unlock(&fs_info->chunk_mutex);
  878. mutex_unlock(&fs_devices->device_list_mutex);
  879. /* replace the sysfs entry */
  880. btrfs_sysfs_remove_device(src_device);
  881. btrfs_sysfs_update_devid(tgt_device);
  882. if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
  883. btrfs_scratch_superblocks(fs_info, src_device->bdev,
  884. src_device->name->str);
  885. /* write back the superblocks */
  886. trans = btrfs_start_transaction(root, 0);
  887. if (!IS_ERR(trans))
  888. btrfs_commit_transaction(trans);
  889. mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
  890. btrfs_rm_dev_replace_free_srcdev(src_device);
  891. return 0;
  892. }
  893. /*
  894. * Read progress of device replace status according to the state and last
  895. * stored position. The value format is the same as for
  896. * btrfs_dev_replace::progress_1000
  897. */
  898. static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info)
  899. {
  900. struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
  901. u64 ret = 0;
  902. switch (dev_replace->replace_state) {
  903. case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
  904. case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
  905. ret = 0;
  906. break;
  907. case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
  908. ret = 1000;
  909. break;
  910. case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
  911. case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
  912. ret = div64_u64(dev_replace->cursor_left,
  913. div_u64(btrfs_device_get_total_bytes(
  914. dev_replace->srcdev), 1000));
  915. break;
  916. }
  917. return ret;
  918. }
  919. void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
  920. struct btrfs_ioctl_dev_replace_args *args)
  921. {
  922. struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
  923. down_read(&dev_replace->rwsem);
  924. /* even if !dev_replace_is_valid, the values are good enough for
  925. * the replace_status ioctl */
  926. args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
  927. args->status.replace_state = dev_replace->replace_state;
  928. args->status.time_started = dev_replace->time_started;
  929. args->status.time_stopped = dev_replace->time_stopped;
  930. args->status.num_write_errors =
  931. atomic64_read(&dev_replace->num_write_errors);
  932. args->status.num_uncorrectable_read_errors =
  933. atomic64_read(&dev_replace->num_uncorrectable_read_errors);
  934. args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
  935. up_read(&dev_replace->rwsem);
  936. }
  937. int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
  938. {
  939. struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
  940. struct btrfs_device *tgt_device = NULL;
  941. struct btrfs_device *src_device = NULL;
  942. struct btrfs_trans_handle *trans;
  943. struct btrfs_root *root = fs_info->tree_root;
  944. int result;
  945. int ret;
  946. if (sb_rdonly(fs_info->sb))
  947. return -EROFS;
  948. mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
  949. down_write(&dev_replace->rwsem);
  950. switch (dev_replace->replace_state) {
  951. case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
  952. case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
  953. case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
  954. result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
  955. up_write(&dev_replace->rwsem);
  956. break;
  957. case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
  958. tgt_device = dev_replace->tgtdev;
  959. src_device = dev_replace->srcdev;
  960. up_write(&dev_replace->rwsem);
  961. ret = btrfs_scrub_cancel(fs_info);
  962. if (ret < 0) {
  963. result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
  964. } else {
  965. result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
  966. /*
  967. * btrfs_dev_replace_finishing() will handle the
  968. * cleanup part
  969. */
  970. btrfs_info_in_rcu(fs_info,
  971. "dev_replace from %s (devid %llu) to %s canceled",
  972. btrfs_dev_name(src_device), src_device->devid,
  973. btrfs_dev_name(tgt_device));
  974. }
  975. break;
  976. case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
  977. /*
  978. * Scrub doing the replace isn't running so we need to do the
  979. * cleanup step of btrfs_dev_replace_finishing() here
  980. */
  981. result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
  982. tgt_device = dev_replace->tgtdev;
  983. src_device = dev_replace->srcdev;
  984. dev_replace->tgtdev = NULL;
  985. dev_replace->srcdev = NULL;
  986. dev_replace->replace_state =
  987. BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
  988. dev_replace->time_stopped = ktime_get_real_seconds();
  989. dev_replace->item_needs_writeback = 1;
  990. up_write(&dev_replace->rwsem);
  991. /* Scrub for replace must not be running in suspended state */
  992. btrfs_scrub_cancel(fs_info);
  993. trans = btrfs_start_transaction(root, 0);
  994. if (IS_ERR(trans)) {
  995. mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
  996. return PTR_ERR(trans);
  997. }
  998. ret = btrfs_commit_transaction(trans);
  999. WARN_ON(ret);
  1000. btrfs_info_in_rcu(fs_info,
  1001. "suspended dev_replace from %s (devid %llu) to %s canceled",
  1002. btrfs_dev_name(src_device), src_device->devid,
  1003. btrfs_dev_name(tgt_device));
  1004. if (tgt_device)
  1005. btrfs_destroy_dev_replace_tgtdev(tgt_device);
  1006. break;
  1007. default:
  1008. up_write(&dev_replace->rwsem);
  1009. result = -EINVAL;
  1010. }
  1011. mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
  1012. return result;
  1013. }
  1014. void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
  1015. {
  1016. struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
  1017. mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
  1018. down_write(&dev_replace->rwsem);
  1019. switch (dev_replace->replace_state) {
  1020. case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
  1021. case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
  1022. case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
  1023. case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
  1024. break;
  1025. case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
  1026. dev_replace->replace_state =
  1027. BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
  1028. dev_replace->time_stopped = ktime_get_real_seconds();
  1029. dev_replace->item_needs_writeback = 1;
  1030. btrfs_info(fs_info, "suspending dev_replace for unmount");
  1031. break;
  1032. }
  1033. up_write(&dev_replace->rwsem);
  1034. mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
  1035. }
  1036. /* resume dev_replace procedure that was interrupted by unmount */
  1037. int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
  1038. {
  1039. struct task_struct *task;
  1040. struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
  1041. down_write(&dev_replace->rwsem);
  1042. switch (dev_replace->replace_state) {
  1043. case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
  1044. case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
  1045. case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
  1046. up_write(&dev_replace->rwsem);
  1047. return 0;
  1048. case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
  1049. break;
  1050. case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
  1051. dev_replace->replace_state =
  1052. BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
  1053. break;
  1054. }
  1055. if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
  1056. btrfs_info(fs_info,
  1057. "cannot continue dev_replace, tgtdev is missing");
  1058. btrfs_info(fs_info,
  1059. "you may cancel the operation after 'mount -o degraded'");
  1060. dev_replace->replace_state =
  1061. BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
  1062. up_write(&dev_replace->rwsem);
  1063. return 0;
  1064. }
  1065. up_write(&dev_replace->rwsem);
  1066. /*
  1067. * This could collide with a paused balance, but the exclusive op logic
  1068. * should never allow both to start and pause. We don't want to allow
  1069. * dev-replace to start anyway.
  1070. */
  1071. if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
  1072. down_write(&dev_replace->rwsem);
  1073. dev_replace->replace_state =
  1074. BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
  1075. up_write(&dev_replace->rwsem);
  1076. btrfs_info(fs_info,
  1077. "cannot resume dev-replace, other exclusive operation running");
  1078. return 0;
  1079. }
  1080. task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
  1081. return PTR_ERR_OR_ZERO(task);
  1082. }
  1083. static int btrfs_dev_replace_kthread(void *data)
  1084. {
  1085. struct btrfs_fs_info *fs_info = data;
  1086. struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
  1087. u64 progress;
  1088. int ret;
  1089. progress = btrfs_dev_replace_progress(fs_info);
  1090. progress = div_u64(progress, 10);
  1091. btrfs_info_in_rcu(fs_info,
  1092. "continuing dev_replace from %s (devid %llu) to target %s @%u%%",
  1093. btrfs_dev_name(dev_replace->srcdev),
  1094. dev_replace->srcdev->devid,
  1095. btrfs_dev_name(dev_replace->tgtdev),
  1096. (unsigned int)progress);
  1097. ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
  1098. dev_replace->committed_cursor_left,
  1099. btrfs_device_get_total_bytes(dev_replace->srcdev),
  1100. &dev_replace->scrub_progress, 0, 1);
  1101. ret = btrfs_dev_replace_finishing(fs_info, ret);
  1102. WARN_ON(ret && ret != -ECANCELED);
  1103. btrfs_exclop_finish(fs_info);
  1104. return 0;
  1105. }
  1106. int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
  1107. {
  1108. if (!dev_replace->is_valid)
  1109. return 0;
  1110. switch (dev_replace->replace_state) {
  1111. case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
  1112. case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
  1113. case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
  1114. return 0;
  1115. case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
  1116. case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
  1117. /*
  1118. * return true even if tgtdev is missing (this is
  1119. * something that can happen if the dev_replace
  1120. * procedure is suspended by an umount and then
  1121. * the tgtdev is missing (or "btrfs dev scan") was
  1122. * not called and the filesystem is remounted
  1123. * in degraded state. This does not stop the
  1124. * dev_replace procedure. It needs to be canceled
  1125. * manually if the cancellation is wanted.
  1126. */
  1127. break;
  1128. }
  1129. return 1;
  1130. }
  1131. void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
  1132. {
  1133. percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
  1134. cond_wake_up_nomb(&fs_info->dev_replace.replace_wait);
  1135. }
  1136. void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
  1137. {
  1138. while (1) {
  1139. percpu_counter_inc(&fs_info->dev_replace.bio_counter);
  1140. if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
  1141. &fs_info->fs_state)))
  1142. break;
  1143. btrfs_bio_counter_dec(fs_info);
  1144. wait_event(fs_info->dev_replace.replace_wait,
  1145. !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
  1146. &fs_info->fs_state));
  1147. }
  1148. }