lock_dlm.c 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
  4. * Copyright 2004-2011 Red Hat, Inc.
  5. */
  6. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  7. #include <linux/fs.h>
  8. #include <linux/dlm.h>
  9. #include <linux/slab.h>
  10. #include <linux/types.h>
  11. #include <linux/delay.h>
  12. #include <linux/gfs2_ondisk.h>
  13. #include <linux/sched/signal.h>
  14. #include "incore.h"
  15. #include "glock.h"
  16. #include "glops.h"
  17. #include "recovery.h"
  18. #include "util.h"
  19. #include "sys.h"
  20. #include "trace_gfs2.h"
  21. /**
  22. * gfs2_update_stats - Update time based stats
  23. * @s: The stats to update (local or global)
  24. * @index: The index inside @s
  25. * @sample: New data to include
  26. */
  27. static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index,
  28. s64 sample)
  29. {
  30. /*
  31. * @delta is the difference between the current rtt sample and the
  32. * running average srtt. We add 1/8 of that to the srtt in order to
  33. * update the current srtt estimate. The variance estimate is a bit
  34. * more complicated. We subtract the current variance estimate from
  35. * the abs value of the @delta and add 1/4 of that to the running
  36. * total. That's equivalent to 3/4 of the current variance
  37. * estimate plus 1/4 of the abs of @delta.
  38. *
  39. * Note that the index points at the array entry containing the
  40. * smoothed mean value, and the variance is always in the following
  41. * entry
  42. *
  43. * Reference: TCP/IP Illustrated, vol 2, p. 831,832
  44. * All times are in units of integer nanoseconds. Unlike the TCP/IP
  45. * case, they are not scaled fixed point.
  46. */
  47. s64 delta = sample - s->stats[index];
  48. s->stats[index] += (delta >> 3);
  49. index++;
  50. s->stats[index] += (s64)(abs(delta) - s->stats[index]) >> 2;
  51. }
  52. /**
  53. * gfs2_update_reply_times - Update locking statistics
  54. * @gl: The glock to update
  55. *
  56. * This assumes that gl->gl_dstamp has been set earlier.
  57. *
  58. * The rtt (lock round trip time) is an estimate of the time
  59. * taken to perform a dlm lock request. We update it on each
  60. * reply from the dlm.
  61. *
  62. * The blocking flag is set on the glock for all dlm requests
  63. * which may potentially block due to lock requests from other nodes.
  64. * DLM requests where the current lock state is exclusive, the
  65. * requested state is null (or unlocked) or where the TRY or
  66. * TRY_1CB flags are set are classified as non-blocking. All
  67. * other DLM requests are counted as (potentially) blocking.
  68. */
  69. static inline void gfs2_update_reply_times(struct gfs2_glock *gl)
  70. {
  71. struct gfs2_pcpu_lkstats *lks;
  72. const unsigned gltype = gl->gl_name.ln_type;
  73. unsigned index = test_bit(GLF_BLOCKING, &gl->gl_flags) ?
  74. GFS2_LKS_SRTTB : GFS2_LKS_SRTT;
  75. s64 rtt;
  76. preempt_disable();
  77. rtt = ktime_to_ns(ktime_sub(ktime_get_real(), gl->gl_dstamp));
  78. lks = this_cpu_ptr(gl->gl_name.ln_sbd->sd_lkstats);
  79. gfs2_update_stats(&gl->gl_stats, index, rtt); /* Local */
  80. gfs2_update_stats(&lks->lkstats[gltype], index, rtt); /* Global */
  81. preempt_enable();
  82. trace_gfs2_glock_lock_time(gl, rtt);
  83. }
  84. /**
  85. * gfs2_update_request_times - Update locking statistics
  86. * @gl: The glock to update
  87. *
  88. * The irt (lock inter-request times) measures the average time
  89. * between requests to the dlm. It is updated immediately before
  90. * each dlm call.
  91. */
  92. static inline void gfs2_update_request_times(struct gfs2_glock *gl)
  93. {
  94. struct gfs2_pcpu_lkstats *lks;
  95. const unsigned gltype = gl->gl_name.ln_type;
  96. ktime_t dstamp;
  97. s64 irt;
  98. preempt_disable();
  99. dstamp = gl->gl_dstamp;
  100. gl->gl_dstamp = ktime_get_real();
  101. irt = ktime_to_ns(ktime_sub(gl->gl_dstamp, dstamp));
  102. lks = this_cpu_ptr(gl->gl_name.ln_sbd->sd_lkstats);
  103. gfs2_update_stats(&gl->gl_stats, GFS2_LKS_SIRT, irt); /* Local */
  104. gfs2_update_stats(&lks->lkstats[gltype], GFS2_LKS_SIRT, irt); /* Global */
  105. preempt_enable();
  106. }
  107. static void gdlm_ast(void *arg)
  108. {
  109. struct gfs2_glock *gl = arg;
  110. unsigned ret = gl->gl_state;
  111. gfs2_update_reply_times(gl);
  112. BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
  113. if ((gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) && gl->gl_lksb.sb_lvbptr)
  114. memset(gl->gl_lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
  115. switch (gl->gl_lksb.sb_status) {
  116. case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
  117. if (gl->gl_ops->go_free)
  118. gl->gl_ops->go_free(gl);
  119. gfs2_glock_free(gl);
  120. return;
  121. case -DLM_ECANCEL: /* Cancel while getting lock */
  122. ret |= LM_OUT_CANCELED;
  123. goto out;
  124. case -EAGAIN: /* Try lock fails */
  125. case -EDEADLK: /* Deadlock detected */
  126. goto out;
  127. case -ETIMEDOUT: /* Canceled due to timeout */
  128. ret |= LM_OUT_ERROR;
  129. goto out;
  130. case 0: /* Success */
  131. break;
  132. default: /* Something unexpected */
  133. BUG();
  134. }
  135. ret = gl->gl_req;
  136. if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) {
  137. if (gl->gl_req == LM_ST_SHARED)
  138. ret = LM_ST_DEFERRED;
  139. else if (gl->gl_req == LM_ST_DEFERRED)
  140. ret = LM_ST_SHARED;
  141. else
  142. BUG();
  143. }
  144. set_bit(GLF_INITIAL, &gl->gl_flags);
  145. gfs2_glock_complete(gl, ret);
  146. return;
  147. out:
  148. if (!test_bit(GLF_INITIAL, &gl->gl_flags))
  149. gl->gl_lksb.sb_lkid = 0;
  150. gfs2_glock_complete(gl, ret);
  151. }
  152. static void gdlm_bast(void *arg, int mode)
  153. {
  154. struct gfs2_glock *gl = arg;
  155. switch (mode) {
  156. case DLM_LOCK_EX:
  157. gfs2_glock_cb(gl, LM_ST_UNLOCKED);
  158. break;
  159. case DLM_LOCK_CW:
  160. gfs2_glock_cb(gl, LM_ST_DEFERRED);
  161. break;
  162. case DLM_LOCK_PR:
  163. gfs2_glock_cb(gl, LM_ST_SHARED);
  164. break;
  165. default:
  166. fs_err(gl->gl_name.ln_sbd, "unknown bast mode %d\n", mode);
  167. BUG();
  168. }
  169. }
  170. /* convert gfs lock-state to dlm lock-mode */
  171. static int make_mode(struct gfs2_sbd *sdp, const unsigned int lmstate)
  172. {
  173. switch (lmstate) {
  174. case LM_ST_UNLOCKED:
  175. return DLM_LOCK_NL;
  176. case LM_ST_EXCLUSIVE:
  177. return DLM_LOCK_EX;
  178. case LM_ST_DEFERRED:
  179. return DLM_LOCK_CW;
  180. case LM_ST_SHARED:
  181. return DLM_LOCK_PR;
  182. }
  183. fs_err(sdp, "unknown LM state %d\n", lmstate);
  184. BUG();
  185. return -1;
  186. }
  187. static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
  188. const int req)
  189. {
  190. u32 lkf = 0;
  191. if (gl->gl_lksb.sb_lvbptr)
  192. lkf |= DLM_LKF_VALBLK;
  193. if (gfs_flags & LM_FLAG_TRY)
  194. lkf |= DLM_LKF_NOQUEUE;
  195. if (gfs_flags & LM_FLAG_TRY_1CB) {
  196. lkf |= DLM_LKF_NOQUEUE;
  197. lkf |= DLM_LKF_NOQUEUEBAST;
  198. }
  199. if (gfs_flags & LM_FLAG_PRIORITY) {
  200. lkf |= DLM_LKF_NOORDER;
  201. lkf |= DLM_LKF_HEADQUE;
  202. }
  203. if (gfs_flags & LM_FLAG_ANY) {
  204. if (req == DLM_LOCK_PR)
  205. lkf |= DLM_LKF_ALTCW;
  206. else if (req == DLM_LOCK_CW)
  207. lkf |= DLM_LKF_ALTPR;
  208. else
  209. BUG();
  210. }
  211. if (gl->gl_lksb.sb_lkid != 0) {
  212. lkf |= DLM_LKF_CONVERT;
  213. if (test_bit(GLF_BLOCKING, &gl->gl_flags))
  214. lkf |= DLM_LKF_QUECVT;
  215. }
  216. return lkf;
  217. }
  218. static void gfs2_reverse_hex(char *c, u64 value)
  219. {
  220. *c = '0';
  221. while (value) {
  222. *c-- = hex_asc[value & 0x0f];
  223. value >>= 4;
  224. }
  225. }
  226. static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
  227. unsigned int flags)
  228. {
  229. struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
  230. int req;
  231. u32 lkf;
  232. char strname[GDLM_STRNAME_BYTES] = "";
  233. int error;
  234. req = make_mode(gl->gl_name.ln_sbd, req_state);
  235. lkf = make_flags(gl, flags, req);
  236. gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT);
  237. gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
  238. if (gl->gl_lksb.sb_lkid) {
  239. gfs2_update_request_times(gl);
  240. } else {
  241. memset(strname, ' ', GDLM_STRNAME_BYTES - 1);
  242. strname[GDLM_STRNAME_BYTES - 1] = '\0';
  243. gfs2_reverse_hex(strname + 7, gl->gl_name.ln_type);
  244. gfs2_reverse_hex(strname + 23, gl->gl_name.ln_number);
  245. gl->gl_dstamp = ktime_get_real();
  246. }
  247. /*
  248. * Submit the actual lock request.
  249. */
  250. again:
  251. error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname,
  252. GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
  253. if (error == -EBUSY) {
  254. msleep(20);
  255. goto again;
  256. }
  257. return error;
  258. }
  259. static void gdlm_put_lock(struct gfs2_glock *gl)
  260. {
  261. struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
  262. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  263. int error;
  264. if (gl->gl_lksb.sb_lkid == 0) {
  265. gfs2_glock_free(gl);
  266. return;
  267. }
  268. clear_bit(GLF_BLOCKING, &gl->gl_flags);
  269. gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT);
  270. gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
  271. gfs2_update_request_times(gl);
  272. /* don't want to call dlm if we've unmounted the lock protocol */
  273. if (test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) {
  274. gfs2_glock_free(gl);
  275. return;
  276. }
  277. /* don't want to skip dlm_unlock writing the lvb when lock has one */
  278. if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) &&
  279. !gl->gl_lksb.sb_lvbptr) {
  280. gfs2_glock_free(gl);
  281. return;
  282. }
  283. again:
  284. error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
  285. NULL, gl);
  286. if (error == -EBUSY) {
  287. msleep(20);
  288. goto again;
  289. }
  290. if (error) {
  291. fs_err(sdp, "gdlm_unlock %x,%llx err=%d\n",
  292. gl->gl_name.ln_type,
  293. (unsigned long long)gl->gl_name.ln_number, error);
  294. return;
  295. }
  296. }
  297. static void gdlm_cancel(struct gfs2_glock *gl)
  298. {
  299. struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
  300. dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
  301. }
  302. /*
  303. * dlm/gfs2 recovery coordination using dlm_recover callbacks
  304. *
  305. * 0. gfs2 checks for another cluster node withdraw, needing journal replay
  306. * 1. dlm_controld sees lockspace members change
  307. * 2. dlm_controld blocks dlm-kernel locking activity
  308. * 3. dlm_controld within dlm-kernel notifies gfs2 (recover_prep)
  309. * 4. dlm_controld starts and finishes its own user level recovery
  310. * 5. dlm_controld starts dlm-kernel dlm_recoverd to do kernel recovery
  311. * 6. dlm_recoverd notifies gfs2 of failed nodes (recover_slot)
  312. * 7. dlm_recoverd does its own lock recovery
  313. * 8. dlm_recoverd unblocks dlm-kernel locking activity
  314. * 9. dlm_recoverd notifies gfs2 when done (recover_done with new generation)
  315. * 10. gfs2_control updates control_lock lvb with new generation and jid bits
  316. * 11. gfs2_control enqueues journals for gfs2_recover to recover (maybe none)
  317. * 12. gfs2_recover dequeues and recovers journals of failed nodes
  318. * 13. gfs2_recover provides recovery results to gfs2_control (recovery_result)
  319. * 14. gfs2_control updates control_lock lvb jid bits for recovered journals
  320. * 15. gfs2_control unblocks normal locking when all journals are recovered
  321. *
  322. * - failures during recovery
  323. *
  324. * recover_prep() may set BLOCK_LOCKS (step 3) again before gfs2_control
  325. * clears BLOCK_LOCKS (step 15), e.g. another node fails while still
  326. * recovering for a prior failure. gfs2_control needs a way to detect
  327. * this so it can leave BLOCK_LOCKS set in step 15. This is managed using
  328. * the recover_block and recover_start values.
  329. *
  330. * recover_done() provides a new lockspace generation number each time it
  331. * is called (step 9). This generation number is saved as recover_start.
  332. * When recover_prep() is called, it sets BLOCK_LOCKS and sets
  333. * recover_block = recover_start. So, while recover_block is equal to
  334. * recover_start, BLOCK_LOCKS should remain set. (recover_spin must
  335. * be held around the BLOCK_LOCKS/recover_block/recover_start logic.)
  336. *
  337. * - more specific gfs2 steps in sequence above
  338. *
  339. * 3. recover_prep sets BLOCK_LOCKS and sets recover_block = recover_start
  340. * 6. recover_slot records any failed jids (maybe none)
  341. * 9. recover_done sets recover_start = new generation number
  342. * 10. gfs2_control sets control_lock lvb = new gen + bits for failed jids
  343. * 12. gfs2_recover does journal recoveries for failed jids identified above
  344. * 14. gfs2_control clears control_lock lvb bits for recovered jids
  345. * 15. gfs2_control checks if recover_block == recover_start (step 3 occurred
  346. * again) then do nothing, otherwise if recover_start > recover_block
  347. * then clear BLOCK_LOCKS.
  348. *
  349. * - parallel recovery steps across all nodes
  350. *
  351. * All nodes attempt to update the control_lock lvb with the new generation
  352. * number and jid bits, but only the first to get the control_lock EX will
  353. * do so; others will see that it's already done (lvb already contains new
  354. * generation number.)
  355. *
  356. * . All nodes get the same recover_prep/recover_slot/recover_done callbacks
  357. * . All nodes attempt to set control_lock lvb gen + bits for the new gen
  358. * . One node gets control_lock first and writes the lvb, others see it's done
  359. * . All nodes attempt to recover jids for which they see control_lock bits set
  360. * . One node succeeds for a jid, and that one clears the jid bit in the lvb
  361. * . All nodes will eventually see all lvb bits clear and unblock locks
  362. *
  363. * - is there a problem with clearing an lvb bit that should be set
  364. * and missing a journal recovery?
  365. *
  366. * 1. jid fails
  367. * 2. lvb bit set for step 1
  368. * 3. jid recovered for step 1
  369. * 4. jid taken again (new mount)
  370. * 5. jid fails (for step 4)
  371. * 6. lvb bit set for step 5 (will already be set)
  372. * 7. lvb bit cleared for step 3
  373. *
  374. * This is not a problem because the failure in step 5 does not
  375. * require recovery, because the mount in step 4 could not have
  376. * progressed far enough to unblock locks and access the fs. The
  377. * control_mount() function waits for all recoveries to be complete
  378. * for the latest lockspace generation before ever unblocking locks
  379. * and returning. The mount in step 4 waits until the recovery in
  380. * step 1 is done.
  381. *
  382. * - special case of first mounter: first node to mount the fs
  383. *
  384. * The first node to mount a gfs2 fs needs to check all the journals
  385. * and recover any that need recovery before other nodes are allowed
  386. * to mount the fs. (Others may begin mounting, but they must wait
  387. * for the first mounter to be done before taking locks on the fs
  388. * or accessing the fs.) This has two parts:
  389. *
  390. * 1. The mounted_lock tells a node it's the first to mount the fs.
  391. * Each node holds the mounted_lock in PR while it's mounted.
  392. * Each node tries to acquire the mounted_lock in EX when it mounts.
  393. * If a node is granted the mounted_lock EX it means there are no
  394. * other mounted nodes (no PR locks exist), and it is the first mounter.
  395. * The mounted_lock is demoted to PR when first recovery is done, so
  396. * others will fail to get an EX lock, but will get a PR lock.
  397. *
  398. * 2. The control_lock blocks others in control_mount() while the first
  399. * mounter is doing first mount recovery of all journals.
  400. * A mounting node needs to acquire control_lock in EX mode before
  401. * it can proceed. The first mounter holds control_lock in EX while doing
  402. * the first mount recovery, blocking mounts from other nodes, then demotes
  403. * control_lock to NL when it's done (others_may_mount/first_done),
  404. * allowing other nodes to continue mounting.
  405. *
  406. * first mounter:
  407. * control_lock EX/NOQUEUE success
  408. * mounted_lock EX/NOQUEUE success (no other PR, so no other mounters)
  409. * set first=1
  410. * do first mounter recovery
  411. * mounted_lock EX->PR
  412. * control_lock EX->NL, write lvb generation
  413. *
  414. * other mounter:
  415. * control_lock EX/NOQUEUE success (if fail -EAGAIN, retry)
  416. * mounted_lock EX/NOQUEUE fail -EAGAIN (expected due to other mounters PR)
  417. * mounted_lock PR/NOQUEUE success
  418. * read lvb generation
  419. * control_lock EX->NL
  420. * set first=0
  421. *
  422. * - mount during recovery
  423. *
  424. * If a node mounts while others are doing recovery (not first mounter),
  425. * the mounting node will get its initial recover_done() callback without
  426. * having seen any previous failures/callbacks.
  427. *
  428. * It must wait for all recoveries preceding its mount to be finished
  429. * before it unblocks locks. It does this by repeating the "other mounter"
  430. * steps above until the lvb generation number is >= its mount generation
  431. * number (from initial recover_done) and all lvb bits are clear.
  432. *
  433. * - control_lock lvb format
  434. *
  435. * 4 bytes generation number: the latest dlm lockspace generation number
  436. * from recover_done callback. Indicates the jid bitmap has been updated
  437. * to reflect all slot failures through that generation.
  438. * 4 bytes unused.
  439. * GDLM_LVB_SIZE-8 bytes of jid bit map. If bit N is set, it indicates
  440. * that jid N needs recovery.
  441. */
  /*
   * Byte offset of the jid bitmap inside the control_lock lvb.  The bitmap
   * follows the 4 byte generation number and the 4 unused bytes.
   */
  #define JID_BITMAP_OFFSET 8 /* 4 byte generation number + 4 byte unused */
  443. static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen,
  444. char *lvb_bits)
  445. {
  446. __le32 gen;
  447. memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE);
  448. memcpy(&gen, lvb_bits, sizeof(__le32));
  449. *lvb_gen = le32_to_cpu(gen);
  450. }
  451. static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen,
  452. char *lvb_bits)
  453. {
  454. __le32 gen;
  455. memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE);
  456. gen = cpu_to_le32(lvb_gen);
  457. memcpy(ls->ls_control_lvb, &gen, sizeof(__le32));
  458. }
  459. static int all_jid_bits_clear(char *lvb)
  460. {
  461. return !memchr_inv(lvb + JID_BITMAP_OFFSET, 0,
  462. GDLM_LVB_SIZE - JID_BITMAP_OFFSET);
  463. }
  464. static void sync_wait_cb(void *arg)
  465. {
  466. struct lm_lockstruct *ls = arg;
  467. complete(&ls->ls_sync_wait);
  468. }
  469. static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name)
  470. {
  471. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  472. int error;
  473. error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls);
  474. if (error) {
  475. fs_err(sdp, "%s lkid %x error %d\n",
  476. name, lksb->sb_lkid, error);
  477. return error;
  478. }
  479. wait_for_completion(&ls->ls_sync_wait);
  480. if (lksb->sb_status != -DLM_EUNLOCK) {
  481. fs_err(sdp, "%s lkid %x status %d\n",
  482. name, lksb->sb_lkid, lksb->sb_status);
  483. return -1;
  484. }
  485. return 0;
  486. }
  487. static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags,
  488. unsigned int num, struct dlm_lksb *lksb, char *name)
  489. {
  490. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  491. char strname[GDLM_STRNAME_BYTES];
  492. int error, status;
  493. memset(strname, 0, GDLM_STRNAME_BYTES);
  494. snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num);
  495. error = dlm_lock(ls->ls_dlm, mode, lksb, flags,
  496. strname, GDLM_STRNAME_BYTES - 1,
  497. 0, sync_wait_cb, ls, NULL);
  498. if (error) {
  499. fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n",
  500. name, lksb->sb_lkid, flags, mode, error);
  501. return error;
  502. }
  503. wait_for_completion(&ls->ls_sync_wait);
  504. status = lksb->sb_status;
  505. if (status && status != -EAGAIN) {
  506. fs_err(sdp, "%s lkid %x flags %x mode %d status %d\n",
  507. name, lksb->sb_lkid, flags, mode, status);
  508. }
  509. return status;
  510. }
  511. static int mounted_unlock(struct gfs2_sbd *sdp)
  512. {
  513. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  514. return sync_unlock(sdp, &ls->ls_mounted_lksb, "mounted_lock");
  515. }
  516. static int mounted_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
  517. {
  518. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  519. return sync_lock(sdp, mode, flags, GFS2_MOUNTED_LOCK,
  520. &ls->ls_mounted_lksb, "mounted_lock");
  521. }
  522. static int control_unlock(struct gfs2_sbd *sdp)
  523. {
  524. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  525. return sync_unlock(sdp, &ls->ls_control_lksb, "control_lock");
  526. }
  527. static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
  528. {
  529. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  530. return sync_lock(sdp, mode, flags, GFS2_CONTROL_LOCK,
  531. &ls->ls_control_lksb, "control_lock");
  532. }
  533. /**
  534. * remote_withdraw - react to a node withdrawing from the file system
  535. * @sdp: The superblock
  536. */
  537. static void remote_withdraw(struct gfs2_sbd *sdp)
  538. {
  539. struct gfs2_jdesc *jd;
  540. int ret = 0, count = 0;
  541. list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
  542. if (jd->jd_jid == sdp->sd_lockstruct.ls_jid)
  543. continue;
  544. ret = gfs2_recover_journal(jd, true);
  545. if (ret)
  546. break;
  547. count++;
  548. }
  549. /* Now drop the additional reference we acquired */
  550. fs_err(sdp, "Journals checked: %d, ret = %d.\n", count, ret);
  551. }
  552. static void gfs2_control_func(struct work_struct *work)
  553. {
  554. struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work);
  555. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  556. uint32_t block_gen, start_gen, lvb_gen, flags;
  557. int recover_set = 0;
  558. int write_lvb = 0;
  559. int recover_size;
  560. int i, error;
  561. /* First check for other nodes that may have done a withdraw. */
  562. if (test_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags)) {
  563. remote_withdraw(sdp);
  564. clear_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags);
  565. return;
  566. }
  567. spin_lock(&ls->ls_recover_spin);
  568. /*
  569. * No MOUNT_DONE means we're still mounting; control_mount()
  570. * will set this flag, after which this thread will take over
  571. * all further clearing of BLOCK_LOCKS.
  572. *
  573. * FIRST_MOUNT means this node is doing first mounter recovery,
  574. * for which recovery control is handled by
  575. * control_mount()/control_first_done(), not this thread.
  576. */
  577. if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
  578. test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
  579. spin_unlock(&ls->ls_recover_spin);
  580. return;
  581. }
  582. block_gen = ls->ls_recover_block;
  583. start_gen = ls->ls_recover_start;
  584. spin_unlock(&ls->ls_recover_spin);
  585. /*
  586. * Equal block_gen and start_gen implies we are between
  587. * recover_prep and recover_done callbacks, which means
  588. * dlm recovery is in progress and dlm locking is blocked.
  589. * There's no point trying to do any work until recover_done.
  590. */
  591. if (block_gen == start_gen)
  592. return;
  593. /*
  594. * Propagate recover_submit[] and recover_result[] to lvb:
  595. * dlm_recoverd adds to recover_submit[] jids needing recovery
  596. * gfs2_recover adds to recover_result[] journal recovery results
  597. *
  598. * set lvb bit for jids in recover_submit[] if the lvb has not
  599. * yet been updated for the generation of the failure
  600. *
  601. * clear lvb bit for jids in recover_result[] if the result of
  602. * the journal recovery is SUCCESS
  603. */
  604. error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
  605. if (error) {
  606. fs_err(sdp, "control lock EX error %d\n", error);
  607. return;
  608. }
  609. control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits);
  610. spin_lock(&ls->ls_recover_spin);
  611. if (block_gen != ls->ls_recover_block ||
  612. start_gen != ls->ls_recover_start) {
  613. fs_info(sdp, "recover generation %u block1 %u %u\n",
  614. start_gen, block_gen, ls->ls_recover_block);
  615. spin_unlock(&ls->ls_recover_spin);
  616. control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
  617. return;
  618. }
  619. recover_size = ls->ls_recover_size;
  620. if (lvb_gen <= start_gen) {
  621. /*
  622. * Clear lvb bits for jids we've successfully recovered.
  623. * Because all nodes attempt to recover failed journals,
  624. * a journal can be recovered multiple times successfully
  625. * in succession. Only the first will really do recovery,
  626. * the others find it clean, but still report a successful
  627. * recovery. So, another node may have already recovered
  628. * the jid and cleared the lvb bit for it.
  629. */
  630. for (i = 0; i < recover_size; i++) {
  631. if (ls->ls_recover_result[i] != LM_RD_SUCCESS)
  632. continue;
  633. ls->ls_recover_result[i] = 0;
  634. if (!test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET))
  635. continue;
  636. __clear_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET);
  637. write_lvb = 1;
  638. }
  639. }
  640. if (lvb_gen == start_gen) {
  641. /*
  642. * Failed slots before start_gen are already set in lvb.
  643. */
  644. for (i = 0; i < recover_size; i++) {
  645. if (!ls->ls_recover_submit[i])
  646. continue;
  647. if (ls->ls_recover_submit[i] < lvb_gen)
  648. ls->ls_recover_submit[i] = 0;
  649. }
  650. } else if (lvb_gen < start_gen) {
  651. /*
  652. * Failed slots before start_gen are not yet set in lvb.
  653. */
  654. for (i = 0; i < recover_size; i++) {
  655. if (!ls->ls_recover_submit[i])
  656. continue;
  657. if (ls->ls_recover_submit[i] < start_gen) {
  658. ls->ls_recover_submit[i] = 0;
  659. __set_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET);
  660. }
  661. }
  662. /* even if there are no bits to set, we need to write the
  663. latest generation to the lvb */
  664. write_lvb = 1;
  665. } else {
  666. /*
  667. * we should be getting a recover_done() for lvb_gen soon
  668. */
  669. }
  670. spin_unlock(&ls->ls_recover_spin);
  671. if (write_lvb) {
  672. control_lvb_write(ls, start_gen, ls->ls_lvb_bits);
  673. flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK;
  674. } else {
  675. flags = DLM_LKF_CONVERT;
  676. }
  677. error = control_lock(sdp, DLM_LOCK_NL, flags);
  678. if (error) {
  679. fs_err(sdp, "control lock NL error %d\n", error);
  680. return;
  681. }
  682. /*
  683. * Everyone will see jid bits set in the lvb, run gfs2_recover_set(),
  684. * and clear a jid bit in the lvb if the recovery is a success.
  685. * Eventually all journals will be recovered, all jid bits will
  686. * be cleared in the lvb, and everyone will clear BLOCK_LOCKS.
  687. */
  688. for (i = 0; i < recover_size; i++) {
  689. if (test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET)) {
  690. fs_info(sdp, "recover generation %u jid %d\n",
  691. start_gen, i);
  692. gfs2_recover_set(sdp, i);
  693. recover_set++;
  694. }
  695. }
  696. if (recover_set)
  697. return;
  698. /*
  699. * No more jid bits set in lvb, all recovery is done, unblock locks
  700. * (unless a new recover_prep callback has occurred blocking locks
  701. * again while working above)
  702. */
  703. spin_lock(&ls->ls_recover_spin);
  704. if (ls->ls_recover_block == block_gen &&
  705. ls->ls_recover_start == start_gen) {
  706. clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
  707. spin_unlock(&ls->ls_recover_spin);
  708. fs_info(sdp, "recover generation %u done\n", start_gen);
  709. gfs2_glock_thaw(sdp);
  710. } else {
  711. fs_info(sdp, "recover generation %u block2 %u %u\n",
  712. start_gen, block_gen, ls->ls_recover_block);
  713. spin_unlock(&ls->ls_recover_spin);
  714. }
  715. }
  716. static int control_mount(struct gfs2_sbd *sdp)
  717. {
  718. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  719. uint32_t start_gen, block_gen, mount_gen, lvb_gen;
  720. int mounted_mode;
  721. int retries = 0;
  722. int error;
  723. memset(&ls->ls_mounted_lksb, 0, sizeof(struct dlm_lksb));
  724. memset(&ls->ls_control_lksb, 0, sizeof(struct dlm_lksb));
  725. memset(&ls->ls_control_lvb, 0, GDLM_LVB_SIZE);
  726. ls->ls_control_lksb.sb_lvbptr = ls->ls_control_lvb;
  727. init_completion(&ls->ls_sync_wait);
  728. set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
  729. error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_VALBLK);
  730. if (error) {
  731. fs_err(sdp, "control_mount control_lock NL error %d\n", error);
  732. return error;
  733. }
  734. error = mounted_lock(sdp, DLM_LOCK_NL, 0);
  735. if (error) {
  736. fs_err(sdp, "control_mount mounted_lock NL error %d\n", error);
  737. control_unlock(sdp);
  738. return error;
  739. }
  740. mounted_mode = DLM_LOCK_NL;
  741. restart:
  742. if (retries++ && signal_pending(current)) {
  743. error = -EINTR;
  744. goto fail;
  745. }
  746. /*
  747. * We always start with both locks in NL. control_lock is
  748. * demoted to NL below so we don't need to do it here.
  749. */
  750. if (mounted_mode != DLM_LOCK_NL) {
  751. error = mounted_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
  752. if (error)
  753. goto fail;
  754. mounted_mode = DLM_LOCK_NL;
  755. }
  756. /*
  757. * Other nodes need to do some work in dlm recovery and gfs2_control
  758. * before the recover_done and control_lock will be ready for us below.
  759. * A delay here is not required but often avoids having to retry.
  760. */
  761. msleep_interruptible(500);
  762. /*
  763. * Acquire control_lock in EX and mounted_lock in either EX or PR.
  764. * control_lock lvb keeps track of any pending journal recoveries.
  765. * mounted_lock indicates if any other nodes have the fs mounted.
  766. */
  767. error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE|DLM_LKF_VALBLK);
  768. if (error == -EAGAIN) {
  769. goto restart;
  770. } else if (error) {
  771. fs_err(sdp, "control_mount control_lock EX error %d\n", error);
  772. goto fail;
  773. }
  774. /**
  775. * If we're a spectator, we don't want to take the lock in EX because
  776. * we cannot do the first-mount responsibility it implies: recovery.
  777. */
  778. if (sdp->sd_args.ar_spectator)
  779. goto locks_done;
  780. error = mounted_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE);
  781. if (!error) {
  782. mounted_mode = DLM_LOCK_EX;
  783. goto locks_done;
  784. } else if (error != -EAGAIN) {
  785. fs_err(sdp, "control_mount mounted_lock EX error %d\n", error);
  786. goto fail;
  787. }
  788. error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE);
  789. if (!error) {
  790. mounted_mode = DLM_LOCK_PR;
  791. goto locks_done;
  792. } else {
  793. /* not even -EAGAIN should happen here */
  794. fs_err(sdp, "control_mount mounted_lock PR error %d\n", error);
  795. goto fail;
  796. }
  797. locks_done:
  798. /*
  799. * If we got both locks above in EX, then we're the first mounter.
  800. * If not, then we need to wait for the control_lock lvb to be
  801. * updated by other mounted nodes to reflect our mount generation.
  802. *
  803. * In simple first mounter cases, first mounter will see zero lvb_gen,
  804. * but in cases where all existing nodes leave/fail before mounting
  805. * nodes finish control_mount, then all nodes will be mounting and
  806. * lvb_gen will be non-zero.
  807. */
  808. control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits);
  809. if (lvb_gen == 0xFFFFFFFF) {
  810. /* special value to force mount attempts to fail */
  811. fs_err(sdp, "control_mount control_lock disabled\n");
  812. error = -EINVAL;
  813. goto fail;
  814. }
  815. if (mounted_mode == DLM_LOCK_EX) {
  816. /* first mounter, keep both EX while doing first recovery */
  817. spin_lock(&ls->ls_recover_spin);
  818. clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
  819. set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags);
  820. set_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
  821. spin_unlock(&ls->ls_recover_spin);
  822. fs_info(sdp, "first mounter control generation %u\n", lvb_gen);
  823. return 0;
  824. }
  825. error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
  826. if (error)
  827. goto fail;
  828. /*
  829. * We are not first mounter, now we need to wait for the control_lock
  830. * lvb generation to be >= the generation from our first recover_done
  831. * and all lvb bits to be clear (no pending journal recoveries.)
  832. */
  833. if (!all_jid_bits_clear(ls->ls_lvb_bits)) {
  834. /* journals need recovery, wait until all are clear */
  835. fs_info(sdp, "control_mount wait for journal recovery\n");
  836. goto restart;
  837. }
  838. spin_lock(&ls->ls_recover_spin);
  839. block_gen = ls->ls_recover_block;
  840. start_gen = ls->ls_recover_start;
  841. mount_gen = ls->ls_recover_mount;
  842. if (lvb_gen < mount_gen) {
  843. /* wait for mounted nodes to update control_lock lvb to our
  844. generation, which might include new recovery bits set */
  845. if (sdp->sd_args.ar_spectator) {
  846. fs_info(sdp, "Recovery is required. Waiting for a "
  847. "non-spectator to mount.\n");
  848. msleep_interruptible(1000);
  849. } else {
  850. fs_info(sdp, "control_mount wait1 block %u start %u "
  851. "mount %u lvb %u flags %lx\n", block_gen,
  852. start_gen, mount_gen, lvb_gen,
  853. ls->ls_recover_flags);
  854. }
  855. spin_unlock(&ls->ls_recover_spin);
  856. goto restart;
  857. }
  858. if (lvb_gen != start_gen) {
  859. /* wait for mounted nodes to update control_lock lvb to the
  860. latest recovery generation */
  861. fs_info(sdp, "control_mount wait2 block %u start %u mount %u "
  862. "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
  863. lvb_gen, ls->ls_recover_flags);
  864. spin_unlock(&ls->ls_recover_spin);
  865. goto restart;
  866. }
  867. if (block_gen == start_gen) {
  868. /* dlm recovery in progress, wait for it to finish */
  869. fs_info(sdp, "control_mount wait3 block %u start %u mount %u "
  870. "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
  871. lvb_gen, ls->ls_recover_flags);
  872. spin_unlock(&ls->ls_recover_spin);
  873. goto restart;
  874. }
  875. clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
  876. set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags);
  877. memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t));
  878. memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));
  879. spin_unlock(&ls->ls_recover_spin);
  880. return 0;
  881. fail:
  882. mounted_unlock(sdp);
  883. control_unlock(sdp);
  884. return error;
  885. }
  886. static int control_first_done(struct gfs2_sbd *sdp)
  887. {
  888. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  889. uint32_t start_gen, block_gen;
  890. int error;
  891. restart:
  892. spin_lock(&ls->ls_recover_spin);
  893. start_gen = ls->ls_recover_start;
  894. block_gen = ls->ls_recover_block;
  895. if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags) ||
  896. !test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
  897. !test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
  898. /* sanity check, should not happen */
  899. fs_err(sdp, "control_first_done start %u block %u flags %lx\n",
  900. start_gen, block_gen, ls->ls_recover_flags);
  901. spin_unlock(&ls->ls_recover_spin);
  902. control_unlock(sdp);
  903. return -1;
  904. }
  905. if (start_gen == block_gen) {
  906. /*
  907. * Wait for the end of a dlm recovery cycle to switch from
  908. * first mounter recovery. We can ignore any recover_slot
  909. * callbacks between the recover_prep and next recover_done
  910. * because we are still the first mounter and any failed nodes
  911. * have not fully mounted, so they don't need recovery.
  912. */
  913. spin_unlock(&ls->ls_recover_spin);
  914. fs_info(sdp, "control_first_done wait gen %u\n", start_gen);
  915. wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY,
  916. TASK_UNINTERRUPTIBLE);
  917. goto restart;
  918. }
  919. clear_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
  920. set_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags);
  921. memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t));
  922. memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));
  923. spin_unlock(&ls->ls_recover_spin);
  924. memset(ls->ls_lvb_bits, 0, GDLM_LVB_SIZE);
  925. control_lvb_write(ls, start_gen, ls->ls_lvb_bits);
  926. error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT);
  927. if (error)
  928. fs_err(sdp, "control_first_done mounted PR error %d\n", error);
  929. error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
  930. if (error)
  931. fs_err(sdp, "control_first_done control NL error %d\n", error);
  932. return error;
  933. }
  934. /*
  935. * Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC)
  936. * to accommodate the largest slot number. (NB dlm slot numbers start at 1,
  937. * gfs2 jids start at 0, so jid = slot - 1)
  938. */
  939. #define RECOVER_SIZE_INC 16
  940. static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots,
  941. int num_slots)
  942. {
  943. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  944. uint32_t *submit = NULL;
  945. uint32_t *result = NULL;
  946. uint32_t old_size, new_size;
  947. int i, max_jid;
  948. if (!ls->ls_lvb_bits) {
  949. ls->ls_lvb_bits = kzalloc(GDLM_LVB_SIZE, GFP_NOFS);
  950. if (!ls->ls_lvb_bits)
  951. return -ENOMEM;
  952. }
  953. max_jid = 0;
  954. for (i = 0; i < num_slots; i++) {
  955. if (max_jid < slots[i].slot - 1)
  956. max_jid = slots[i].slot - 1;
  957. }
  958. old_size = ls->ls_recover_size;
  959. new_size = old_size;
  960. while (new_size < max_jid + 1)
  961. new_size += RECOVER_SIZE_INC;
  962. if (new_size == old_size)
  963. return 0;
  964. submit = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS);
  965. result = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS);
  966. if (!submit || !result) {
  967. kfree(submit);
  968. kfree(result);
  969. return -ENOMEM;
  970. }
  971. spin_lock(&ls->ls_recover_spin);
  972. memcpy(submit, ls->ls_recover_submit, old_size * sizeof(uint32_t));
  973. memcpy(result, ls->ls_recover_result, old_size * sizeof(uint32_t));
  974. kfree(ls->ls_recover_submit);
  975. kfree(ls->ls_recover_result);
  976. ls->ls_recover_submit = submit;
  977. ls->ls_recover_result = result;
  978. ls->ls_recover_size = new_size;
  979. spin_unlock(&ls->ls_recover_spin);
  980. return 0;
  981. }
  982. static void free_recover_size(struct lm_lockstruct *ls)
  983. {
  984. kfree(ls->ls_lvb_bits);
  985. kfree(ls->ls_recover_submit);
  986. kfree(ls->ls_recover_result);
  987. ls->ls_recover_submit = NULL;
  988. ls->ls_recover_result = NULL;
  989. ls->ls_recover_size = 0;
  990. ls->ls_lvb_bits = NULL;
  991. }
  992. /* dlm calls before it does lock recovery */
  993. static void gdlm_recover_prep(void *arg)
  994. {
  995. struct gfs2_sbd *sdp = arg;
  996. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  997. if (gfs2_withdrawn(sdp)) {
  998. fs_err(sdp, "recover_prep ignored due to withdraw.\n");
  999. return;
  1000. }
  1001. spin_lock(&ls->ls_recover_spin);
  1002. ls->ls_recover_block = ls->ls_recover_start;
  1003. set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
  1004. if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
  1005. test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
  1006. spin_unlock(&ls->ls_recover_spin);
  1007. return;
  1008. }
  1009. set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
  1010. spin_unlock(&ls->ls_recover_spin);
  1011. }
  1012. /* dlm calls after recover_prep has been completed on all lockspace members;
  1013. identifies slot/jid of failed member */
  1014. static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
  1015. {
  1016. struct gfs2_sbd *sdp = arg;
  1017. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  1018. int jid = slot->slot - 1;
  1019. if (gfs2_withdrawn(sdp)) {
  1020. fs_err(sdp, "recover_slot jid %d ignored due to withdraw.\n",
  1021. jid);
  1022. return;
  1023. }
  1024. spin_lock(&ls->ls_recover_spin);
  1025. if (ls->ls_recover_size < jid + 1) {
  1026. fs_err(sdp, "recover_slot jid %d gen %u short size %d\n",
  1027. jid, ls->ls_recover_block, ls->ls_recover_size);
  1028. spin_unlock(&ls->ls_recover_spin);
  1029. return;
  1030. }
  1031. if (ls->ls_recover_submit[jid]) {
  1032. fs_info(sdp, "recover_slot jid %d gen %u prev %u\n",
  1033. jid, ls->ls_recover_block, ls->ls_recover_submit[jid]);
  1034. }
  1035. ls->ls_recover_submit[jid] = ls->ls_recover_block;
  1036. spin_unlock(&ls->ls_recover_spin);
  1037. }
  1038. /* dlm calls after recover_slot and after it completes lock recovery */
  1039. static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots,
  1040. int our_slot, uint32_t generation)
  1041. {
  1042. struct gfs2_sbd *sdp = arg;
  1043. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  1044. if (gfs2_withdrawn(sdp)) {
  1045. fs_err(sdp, "recover_done ignored due to withdraw.\n");
  1046. return;
  1047. }
  1048. /* ensure the ls jid arrays are large enough */
  1049. set_recover_size(sdp, slots, num_slots);
  1050. spin_lock(&ls->ls_recover_spin);
  1051. ls->ls_recover_start = generation;
  1052. if (!ls->ls_recover_mount) {
  1053. ls->ls_recover_mount = generation;
  1054. ls->ls_jid = our_slot - 1;
  1055. }
  1056. if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags))
  1057. queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0);
  1058. clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
  1059. smp_mb__after_atomic();
  1060. wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY);
  1061. spin_unlock(&ls->ls_recover_spin);
  1062. }
  1063. /* gfs2_recover thread has a journal recovery result */
  1064. static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid,
  1065. unsigned int result)
  1066. {
  1067. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  1068. if (gfs2_withdrawn(sdp)) {
  1069. fs_err(sdp, "recovery_result jid %d ignored due to withdraw.\n",
  1070. jid);
  1071. return;
  1072. }
  1073. if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
  1074. return;
  1075. /* don't care about the recovery of own journal during mount */
  1076. if (jid == ls->ls_jid)
  1077. return;
  1078. spin_lock(&ls->ls_recover_spin);
  1079. if (test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
  1080. spin_unlock(&ls->ls_recover_spin);
  1081. return;
  1082. }
  1083. if (ls->ls_recover_size < jid + 1) {
  1084. fs_err(sdp, "recovery_result jid %d short size %d\n",
  1085. jid, ls->ls_recover_size);
  1086. spin_unlock(&ls->ls_recover_spin);
  1087. return;
  1088. }
  1089. fs_info(sdp, "recover jid %d result %s\n", jid,
  1090. result == LM_RD_GAVEUP ? "busy" : "success");
  1091. ls->ls_recover_result[jid] = result;
  1092. /* GAVEUP means another node is recovering the journal; delay our
  1093. next attempt to recover it, to give the other node a chance to
  1094. finish before trying again */
  1095. if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags))
  1096. queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work,
  1097. result == LM_RD_GAVEUP ? HZ : 0);
  1098. spin_unlock(&ls->ls_recover_spin);
  1099. }
  1100. static const struct dlm_lockspace_ops gdlm_lockspace_ops = {
  1101. .recover_prep = gdlm_recover_prep,
  1102. .recover_slot = gdlm_recover_slot,
  1103. .recover_done = gdlm_recover_done,
  1104. };
  1105. static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
  1106. {
  1107. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  1108. char cluster[GFS2_LOCKNAME_LEN];
  1109. const char *fsname;
  1110. uint32_t flags;
  1111. int error, ops_result;
  1112. /*
  1113. * initialize everything
  1114. */
  1115. INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func);
  1116. spin_lock_init(&ls->ls_recover_spin);
  1117. ls->ls_recover_flags = 0;
  1118. ls->ls_recover_mount = 0;
  1119. ls->ls_recover_start = 0;
  1120. ls->ls_recover_block = 0;
  1121. ls->ls_recover_size = 0;
  1122. ls->ls_recover_submit = NULL;
  1123. ls->ls_recover_result = NULL;
  1124. ls->ls_lvb_bits = NULL;
  1125. error = set_recover_size(sdp, NULL, 0);
  1126. if (error)
  1127. goto fail;
  1128. /*
  1129. * prepare dlm_new_lockspace args
  1130. */
  1131. fsname = strchr(table, ':');
  1132. if (!fsname) {
  1133. fs_info(sdp, "no fsname found\n");
  1134. error = -EINVAL;
  1135. goto fail_free;
  1136. }
  1137. memset(cluster, 0, sizeof(cluster));
  1138. memcpy(cluster, table, strlen(table) - strlen(fsname));
  1139. fsname++;
  1140. flags = DLM_LSFL_NEWEXCL;
  1141. /*
  1142. * create/join lockspace
  1143. */
  1144. error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE,
  1145. &gdlm_lockspace_ops, sdp, &ops_result,
  1146. &ls->ls_dlm);
  1147. if (error) {
  1148. fs_err(sdp, "dlm_new_lockspace error %d\n", error);
  1149. goto fail_free;
  1150. }
  1151. if (ops_result < 0) {
  1152. /*
  1153. * dlm does not support ops callbacks,
  1154. * old dlm_controld/gfs_controld are used, try without ops.
  1155. */
  1156. fs_info(sdp, "dlm lockspace ops not used\n");
  1157. free_recover_size(ls);
  1158. set_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags);
  1159. return 0;
  1160. }
  1161. if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) {
  1162. fs_err(sdp, "dlm lockspace ops disallow jid preset\n");
  1163. error = -EINVAL;
  1164. goto fail_release;
  1165. }
  1166. /*
  1167. * control_mount() uses control_lock to determine first mounter,
  1168. * and for later mounts, waits for any recoveries to be cleared.
  1169. */
  1170. error = control_mount(sdp);
  1171. if (error) {
  1172. fs_err(sdp, "mount control error %d\n", error);
  1173. goto fail_release;
  1174. }
  1175. ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
  1176. clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
  1177. smp_mb__after_atomic();
  1178. wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
  1179. return 0;
  1180. fail_release:
  1181. dlm_release_lockspace(ls->ls_dlm, 2);
  1182. fail_free:
  1183. free_recover_size(ls);
  1184. fail:
  1185. return error;
  1186. }
  1187. static void gdlm_first_done(struct gfs2_sbd *sdp)
  1188. {
  1189. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  1190. int error;
  1191. if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
  1192. return;
  1193. error = control_first_done(sdp);
  1194. if (error)
  1195. fs_err(sdp, "mount first_done error %d\n", error);
  1196. }
  1197. static void gdlm_unmount(struct gfs2_sbd *sdp)
  1198. {
  1199. struct lm_lockstruct *ls = &sdp->sd_lockstruct;
  1200. if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
  1201. goto release;
  1202. /* wait for gfs2_control_wq to be done with this mount */
  1203. spin_lock(&ls->ls_recover_spin);
  1204. set_bit(DFL_UNMOUNT, &ls->ls_recover_flags);
  1205. spin_unlock(&ls->ls_recover_spin);
  1206. flush_delayed_work(&sdp->sd_control_work);
  1207. /* mounted_lock and control_lock will be purged in dlm recovery */
  1208. release:
  1209. if (ls->ls_dlm) {
  1210. dlm_release_lockspace(ls->ls_dlm, 2);
  1211. ls->ls_dlm = NULL;
  1212. }
  1213. free_recover_size(ls);
  1214. }
  1215. static const match_table_t dlm_tokens = {
  1216. { Opt_jid, "jid=%d"},
  1217. { Opt_id, "id=%d"},
  1218. { Opt_first, "first=%d"},
  1219. { Opt_nodir, "nodir=%d"},
  1220. { Opt_err, NULL },
  1221. };
  1222. const struct lm_lockops gfs2_dlm_ops = {
  1223. .lm_proto_name = "lock_dlm",
  1224. .lm_mount = gdlm_mount,
  1225. .lm_first_done = gdlm_first_done,
  1226. .lm_recovery_result = gdlm_recovery_result,
  1227. .lm_unmount = gdlm_unmount,
  1228. .lm_put_lock = gdlm_put_lock,
  1229. .lm_lock = gdlm_lock,
  1230. .lm_cancel = gdlm_cancel,
  1231. .lm_tokens = &dlm_tokens,
  1232. };