rotate.c 13 KB


  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /* Handle fileserver selection and rotation.
  3. *
  4. * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
  5. * Written by David Howells (dhowells@redhat.com)
  6. */
  7. #include <linux/kernel.h>
  8. #include <linux/slab.h>
  9. #include <linux/fs.h>
  10. #include <linux/sched.h>
  11. #include <linux/delay.h>
  12. #include <linux/sched/signal.h>
  13. #include "internal.h"
  14. #include "afs_fs.h"
  15. /*
  16. * Begin iteration through a server list, starting with the vnode's last used
  17. * server if possible, or the last recorded good server if not.
  18. */
  19. static bool afs_start_fs_iteration(struct afs_operation *op,
  20. struct afs_vnode *vnode)
  21. {
  22. struct afs_server *server;
  23. void *cb_server;
  24. int i;
  25. read_lock(&op->volume->servers_lock);
  26. op->server_list = afs_get_serverlist(
  27. rcu_dereference_protected(op->volume->servers,
  28. lockdep_is_held(&op->volume->servers_lock)));
  29. read_unlock(&op->volume->servers_lock);
  30. op->untried = (1UL << op->server_list->nr_servers) - 1;
  31. op->index = READ_ONCE(op->server_list->preferred);
  32. cb_server = vnode->cb_server;
  33. if (cb_server) {
  34. /* See if the vnode's preferred record is still available */
  35. for (i = 0; i < op->server_list->nr_servers; i++) {
  36. server = op->server_list->servers[i].server;
  37. if (server == cb_server) {
  38. op->index = i;
  39. goto found_interest;
  40. }
  41. }
  42. /* If we have a lock outstanding on a server that's no longer
  43. * serving this vnode, then we can't switch to another server
  44. * and have to return an error.
  45. */
  46. if (op->flags & AFS_OPERATION_CUR_ONLY) {
  47. op->error = -ESTALE;
  48. return false;
  49. }
  50. /* Note that the callback promise is effectively broken */
  51. write_seqlock(&vnode->cb_lock);
  52. ASSERTCMP(cb_server, ==, vnode->cb_server);
  53. vnode->cb_server = NULL;
  54. if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
  55. vnode->cb_break++;
  56. write_sequnlock(&vnode->cb_lock);
  57. }
  58. found_interest:
  59. return true;
  60. }
  61. /*
  62. * Post volume busy note.
  63. */
  64. static void afs_busy(struct afs_volume *volume, u32 abort_code)
  65. {
  66. const char *m;
  67. switch (abort_code) {
  68. case VOFFLINE: m = "offline"; break;
  69. case VRESTARTING: m = "restarting"; break;
  70. case VSALVAGING: m = "being salvaged"; break;
  71. default: m = "busy"; break;
  72. }
  73. pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
  74. }
  75. /*
  76. * Sleep and retry the operation to the same fileserver.
  77. */
  78. static bool afs_sleep_and_retry(struct afs_operation *op)
  79. {
  80. if (!(op->flags & AFS_OPERATION_UNINTR)) {
  81. msleep_interruptible(1000);
  82. if (signal_pending(current)) {
  83. op->error = -ERESTARTSYS;
  84. return false;
  85. }
  86. } else {
  87. msleep(1000);
  88. }
  89. return true;
  90. }
  91. /*
  92. * Select the fileserver to use. May be called multiple times to rotate
  93. * through the fileservers.
  94. */
  95. bool afs_select_fileserver(struct afs_operation *op)
  96. {
  97. struct afs_addr_list *alist;
  98. struct afs_server *server;
  99. struct afs_vnode *vnode = op->file[0].vnode;
  100. struct afs_error e;
  101. u32 rtt;
  102. int error = op->ac.error, i;
  103. _enter("%lx[%d],%lx[%d],%d,%d",
  104. op->untried, op->index,
  105. op->ac.tried, op->ac.index,
  106. error, op->ac.abort_code);
  107. if (op->flags & AFS_OPERATION_STOP) {
  108. _leave(" = f [stopped]");
  109. return false;
  110. }
  111. op->nr_iterations++;
  112. /* Evaluate the result of the previous operation, if there was one. */
  113. switch (error) {
  114. case SHRT_MAX:
  115. goto start;
  116. case 0:
  117. default:
  118. /* Success or local failure. Stop. */
  119. op->error = error;
  120. op->flags |= AFS_OPERATION_STOP;
  121. _leave(" = f [okay/local %d]", error);
  122. return false;
  123. case -ECONNABORTED:
  124. /* The far side rejected the operation on some grounds. This
  125. * might involve the server being busy or the volume having been moved.
  126. */
  127. switch (op->ac.abort_code) {
  128. case VNOVOL:
  129. /* This fileserver doesn't know about the volume.
  130. * - May indicate that the VL is wrong - retry once and compare
  131. * the results.
  132. * - May indicate that the fileserver couldn't attach to the vol.
  133. */
  134. if (op->flags & AFS_OPERATION_VNOVOL) {
  135. op->error = -EREMOTEIO;
  136. goto next_server;
  137. }
  138. write_lock(&op->volume->servers_lock);
  139. op->server_list->vnovol_mask |= 1 << op->index;
  140. write_unlock(&op->volume->servers_lock);
  141. set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
  142. error = afs_check_volume_status(op->volume, op);
  143. if (error < 0)
  144. goto failed_set_error;
  145. if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) {
  146. op->error = -ENOMEDIUM;
  147. goto failed;
  148. }
  149. /* If the server list didn't change, then assume that
  150. * it's the fileserver having trouble.
  151. */
  152. if (rcu_access_pointer(op->volume->servers) == op->server_list) {
  153. op->error = -EREMOTEIO;
  154. goto next_server;
  155. }
  156. /* Try again */
  157. op->flags |= AFS_OPERATION_VNOVOL;
  158. _leave(" = t [vnovol]");
  159. return true;
  160. case VSALVAGE: /* TODO: Should this return an error or iterate? */
  161. case VVOLEXISTS:
  162. case VNOSERVICE:
  163. case VONLINE:
  164. case VDISKFULL:
  165. case VOVERQUOTA:
  166. op->error = afs_abort_to_error(op->ac.abort_code);
  167. goto next_server;
  168. case VOFFLINE:
  169. if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) {
  170. afs_busy(op->volume, op->ac.abort_code);
  171. clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
  172. }
  173. if (op->flags & AFS_OPERATION_NO_VSLEEP) {
  174. op->error = -EADV;
  175. goto failed;
  176. }
  177. if (op->flags & AFS_OPERATION_CUR_ONLY) {
  178. op->error = -ESTALE;
  179. goto failed;
  180. }
  181. goto busy;
  182. case VSALVAGING:
  183. case VRESTARTING:
  184. case VBUSY:
  185. /* Retry after going round all the servers unless we
  186. * have a file lock we need to maintain.
  187. */
  188. if (op->flags & AFS_OPERATION_NO_VSLEEP) {
  189. op->error = -EBUSY;
  190. goto failed;
  191. }
  192. if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) {
  193. afs_busy(op->volume, op->ac.abort_code);
  194. clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
  195. }
  196. busy:
  197. if (op->flags & AFS_OPERATION_CUR_ONLY) {
  198. if (!afs_sleep_and_retry(op))
  199. goto failed;
  200. /* Retry with same server & address */
  201. _leave(" = t [vbusy]");
  202. return true;
  203. }
  204. op->flags |= AFS_OPERATION_VBUSY;
  205. goto next_server;
  206. case VMOVED:
  207. /* The volume migrated to another server. We consider
  208. * consider all locks and callbacks broken and request
  209. * an update from the VLDB.
  210. *
  211. * We also limit the number of VMOVED hops we will
  212. * honour, just in case someone sets up a loop.
  213. */
  214. if (op->flags & AFS_OPERATION_VMOVED) {
  215. op->error = -EREMOTEIO;
  216. goto failed;
  217. }
  218. op->flags |= AFS_OPERATION_VMOVED;
  219. set_bit(AFS_VOLUME_WAIT, &op->volume->flags);
  220. set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
  221. error = afs_check_volume_status(op->volume, op);
  222. if (error < 0)
  223. goto failed_set_error;
  224. /* If the server list didn't change, then the VLDB is
  225. * out of sync with the fileservers. This is hopefully
  226. * a temporary condition, however, so we don't want to
  227. * permanently block access to the file.
  228. *
  229. * TODO: Try other fileservers if we can.
  230. *
  231. * TODO: Retry a few times with sleeps.
  232. */
  233. if (rcu_access_pointer(op->volume->servers) == op->server_list) {
  234. op->error = -ENOMEDIUM;
  235. goto failed;
  236. }
  237. goto restart_from_beginning;
  238. default:
  239. clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
  240. clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
  241. op->error = afs_abort_to_error(op->ac.abort_code);
  242. goto failed;
  243. }
  244. case -ETIMEDOUT:
  245. case -ETIME:
  246. if (op->error != -EDESTADDRREQ)
  247. goto iterate_address;
  248. fallthrough;
  249. case -ERFKILL:
  250. case -EADDRNOTAVAIL:
  251. case -ENETUNREACH:
  252. case -EHOSTUNREACH:
  253. case -EHOSTDOWN:
  254. case -ECONNREFUSED:
  255. _debug("no conn");
  256. op->error = error;
  257. goto iterate_address;
  258. case -ENETRESET:
  259. pr_warn("kAFS: Peer reset %s (op=%x)\n",
  260. op->type ? op->type->name : "???", op->debug_id);
  261. fallthrough;
  262. case -ECONNRESET:
  263. _debug("call reset");
  264. op->error = error;
  265. goto failed;
  266. }
  267. restart_from_beginning:
  268. _debug("restart");
  269. afs_end_cursor(&op->ac);
  270. op->server = NULL;
  271. afs_put_serverlist(op->net, op->server_list);
  272. op->server_list = NULL;
  273. start:
  274. _debug("start");
  275. /* See if we need to do an update of the volume record. Note that the
  276. * volume may have moved or even have been deleted.
  277. */
  278. error = afs_check_volume_status(op->volume, op);
  279. if (error < 0)
  280. goto failed_set_error;
  281. if (!afs_start_fs_iteration(op, vnode))
  282. goto failed;
  283. _debug("__ VOL %llx __", op->volume->vid);
  284. pick_server:
  285. _debug("pick [%lx]", op->untried);
  286. error = afs_wait_for_fs_probes(op->server_list, op->untried);
  287. if (error < 0)
  288. goto failed_set_error;
  289. /* Pick the untried server with the lowest RTT. If we have outstanding
  290. * callbacks, we stick with the server we're already using if we can.
  291. */
  292. if (op->server) {
  293. _debug("server %u", op->index);
  294. if (test_bit(op->index, &op->untried))
  295. goto selected_server;
  296. op->server = NULL;
  297. _debug("no server");
  298. }
  299. op->index = -1;
  300. rtt = U32_MAX;
  301. for (i = 0; i < op->server_list->nr_servers; i++) {
  302. struct afs_server *s = op->server_list->servers[i].server;
  303. if (!test_bit(i, &op->untried) ||
  304. !test_bit(AFS_SERVER_FL_RESPONDING, &s->flags))
  305. continue;
  306. if (s->probe.rtt < rtt) {
  307. op->index = i;
  308. rtt = s->probe.rtt;
  309. }
  310. }
  311. if (op->index == -1)
  312. goto no_more_servers;
  313. selected_server:
  314. _debug("use %d", op->index);
  315. __clear_bit(op->index, &op->untried);
  316. /* We're starting on a different fileserver from the list. We need to
  317. * check it, create a callback intercept, find its address list and
  318. * probe its capabilities before we use it.
  319. */
  320. ASSERTCMP(op->ac.alist, ==, NULL);
  321. server = op->server_list->servers[op->index].server;
  322. if (!afs_check_server_record(op, server))
  323. goto failed;
  324. _debug("USING SERVER: %pU", &server->uuid);
  325. op->flags |= AFS_OPERATION_RETRY_SERVER;
  326. op->server = server;
  327. if (vnode->cb_server != server) {
  328. vnode->cb_server = server;
  329. vnode->cb_s_break = server->cb_s_break;
  330. vnode->cb_fs_s_break = atomic_read(&server->cell->fs_s_break);
  331. vnode->cb_v_break = vnode->volume->cb_v_break;
  332. clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
  333. }
  334. read_lock(&server->fs_lock);
  335. alist = rcu_dereference_protected(server->addresses,
  336. lockdep_is_held(&server->fs_lock));
  337. afs_get_addrlist(alist);
  338. read_unlock(&server->fs_lock);
  339. retry_server:
  340. memset(&op->ac, 0, sizeof(op->ac));
  341. if (!op->ac.alist)
  342. op->ac.alist = alist;
  343. else
  344. afs_put_addrlist(alist);
  345. op->ac.index = -1;
  346. iterate_address:
  347. ASSERT(op->ac.alist);
  348. /* Iterate over the current server's address list to try and find an
  349. * address on which it will respond to us.
  350. */
  351. if (!afs_iterate_addresses(&op->ac))
  352. goto out_of_addresses;
  353. _debug("address [%u] %u/%u %pISp",
  354. op->index, op->ac.index, op->ac.alist->nr_addrs,
  355. &op->ac.alist->addrs[op->ac.index].transport);
  356. _leave(" = t");
  357. return true;
  358. out_of_addresses:
  359. /* We've now had a failure to respond on all of a server's addresses -
  360. * immediately probe them again and consider retrying the server.
  361. */
  362. afs_probe_fileserver(op->net, op->server);
  363. if (op->flags & AFS_OPERATION_RETRY_SERVER) {
  364. alist = op->ac.alist;
  365. error = afs_wait_for_one_fs_probe(
  366. op->server, !(op->flags & AFS_OPERATION_UNINTR));
  367. switch (error) {
  368. case 0:
  369. op->flags &= ~AFS_OPERATION_RETRY_SERVER;
  370. goto retry_server;
  371. case -ERESTARTSYS:
  372. goto failed_set_error;
  373. case -ETIME:
  374. case -EDESTADDRREQ:
  375. goto next_server;
  376. }
  377. }
  378. next_server:
  379. _debug("next");
  380. afs_end_cursor(&op->ac);
  381. goto pick_server;
  382. no_more_servers:
  383. /* That's all the servers poked to no good effect. Try again if some
  384. * of them were busy.
  385. */
  386. if (op->flags & AFS_OPERATION_VBUSY)
  387. goto restart_from_beginning;
  388. e.error = -EDESTADDRREQ;
  389. e.responded = false;
  390. for (i = 0; i < op->server_list->nr_servers; i++) {
  391. struct afs_server *s = op->server_list->servers[i].server;
  392. afs_prioritise_error(&e, READ_ONCE(s->probe.error),
  393. s->probe.abort_code);
  394. }
  395. error = e.error;
  396. failed_set_error:
  397. op->error = error;
  398. failed:
  399. op->flags |= AFS_OPERATION_STOP;
  400. afs_end_cursor(&op->ac);
  401. _leave(" = f [failed %d]", op->error);
  402. return false;
  403. }
  404. /*
  405. * Dump cursor state in the case of the error being EDESTADDRREQ.
  406. */
  407. void afs_dump_edestaddrreq(const struct afs_operation *op)
  408. {
  409. static int count;
  410. int i;
  411. if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
  412. return;
  413. count++;
  414. rcu_read_lock();
  415. pr_notice("EDESTADDR occurred\n");
  416. pr_notice("FC: cbb=%x cbb2=%x fl=%x err=%hd\n",
  417. op->file[0].cb_break_before,
  418. op->file[1].cb_break_before, op->flags, op->error);
  419. pr_notice("FC: ut=%lx ix=%d ni=%u\n",
  420. op->untried, op->index, op->nr_iterations);
  421. if (op->server_list) {
  422. const struct afs_server_list *sl = op->server_list;
  423. pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
  424. sl->nr_servers, sl->preferred, sl->vnovol_mask);
  425. for (i = 0; i < sl->nr_servers; i++) {
  426. const struct afs_server *s = sl->servers[i].server;
  427. pr_notice("FC: server fl=%lx av=%u %pU\n",
  428. s->flags, s->addr_version, &s->uuid);
  429. if (s->addresses) {
  430. const struct afs_addr_list *a =
  431. rcu_dereference(s->addresses);
  432. pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n",
  433. a->version,
  434. a->nr_ipv4, a->nr_addrs, a->max_addrs,
  435. a->preferred);
  436. pr_notice("FC: - R=%lx F=%lx\n",
  437. a->responded, a->failed);
  438. if (a == op->ac.alist)
  439. pr_notice("FC: - current\n");
  440. }
  441. }
  442. }
  443. pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
  444. op->ac.tried, op->ac.index, op->ac.abort_code, op->ac.error,
  445. op->ac.responded, op->ac.nr_iterations);
  446. rcu_read_unlock();
  447. }