dm-cache-policy-smq.c

/*
 * Copyright (C) 2015 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-cache-background-tracker.h"
#include "dm-cache-policy-internal.h"
#include "dm-cache-policy.h"
#include "dm.h"

#include <linux/hash.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/math64.h>

#define DM_MSG_PREFIX "cache-policy-smq"

/*----------------------------------------------------------------*/

/*
 * Safe division functions that return zero on divide by zero.
 */
static unsigned int safe_div(unsigned int n, unsigned int d)
{
	return d ? n / d : 0u;
}

static unsigned int safe_mod(unsigned int n, unsigned int d)
{
	return d ? n % d : 0u;
}

/*----------------------------------------------------------------*/

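/*
 * Cache state is kept in small packed entries.  Entries refer to one
 * another by 28-bit indexes into the entry_space defined below rather
 * than by pointer; INDEXER_NULL plays the role of a NULL pointer.
 */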
struct entry {
	unsigned int hash_next:28;
	unsigned int prev:28;
	unsigned int next:28;
	unsigned int level:6;
	bool dirty:1;
	bool allocated:1;
	bool sentinel:1;
	bool pending_work:1;

	dm_oblock_t oblock;
};

/*----------------------------------------------------------------*/

#define INDEXER_NULL ((1u << 28u) - 1u)

/*
 * An entry_space manages a set of entries that we use for the queues.
 * The clean and dirty queues share entries, so this object is separate
 * from the queue itself.
 */
struct entry_space {
	struct entry *begin;
	struct entry *end;
};

static int space_init(struct entry_space *es, unsigned int nr_entries)
{
	if (!nr_entries) {
		es->begin = es->end = NULL;
		return 0;
	}

	es->begin = vzalloc(array_size(nr_entries, sizeof(struct entry)));
	if (!es->begin)
		return -ENOMEM;

	es->end = es->begin + nr_entries;
	return 0;
}

static void space_exit(struct entry_space *es)
{
	vfree(es->begin);
}

static struct entry *__get_entry(struct entry_space *es, unsigned int block)
{
	struct entry *e;

	e = es->begin + block;
	BUG_ON(e >= es->end);

	return e;
}

static unsigned int to_index(struct entry_space *es, struct entry *e)
{
	BUG_ON(e < es->begin || e >= es->end);
	return e - es->begin;
}

static struct entry *to_entry(struct entry_space *es, unsigned int block)
{
	if (block == INDEXER_NULL)
		return NULL;

	return __get_entry(es, block);
}

/*----------------------------------------------------------------*/

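/*
 * An index-based doubly linked list.  head/tail and the per-entry
 * prev/next fields are indexes into the entry_space, with INDEXER_NULL
 * marking the end of the list.  nr_elts deliberately excludes sentinel
 * entries.
 */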
struct ilist {
	unsigned int nr_elts;	/* excluding sentinel entries */
	unsigned int head, tail;
};

static void l_init(struct ilist *l)
{
	l->nr_elts = 0;
	l->head = l->tail = INDEXER_NULL;
}

static struct entry *l_head(struct entry_space *es, struct ilist *l)
{
	return to_entry(es, l->head);
}

static struct entry *l_tail(struct entry_space *es, struct ilist *l)
{
	return to_entry(es, l->tail);
}

static struct entry *l_next(struct entry_space *es, struct entry *e)
{
	return to_entry(es, e->next);
}

static struct entry *l_prev(struct entry_space *es, struct entry *e)
{
	return to_entry(es, e->prev);
}

static bool l_empty(struct ilist *l)
{
	return l->head == INDEXER_NULL;
}

static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
{
	struct entry *head = l_head(es, l);

	e->next = l->head;
	e->prev = INDEXER_NULL;

	if (head)
		head->prev = l->head = to_index(es, e);
	else
		l->head = l->tail = to_index(es, e);

	if (!e->sentinel)
		l->nr_elts++;
}

static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
{
	struct entry *tail = l_tail(es, l);

	e->next = INDEXER_NULL;
	e->prev = l->tail;

	if (tail)
		tail->next = l->tail = to_index(es, e);
	else
		l->head = l->tail = to_index(es, e);

	if (!e->sentinel)
		l->nr_elts++;
}

static void l_add_before(struct entry_space *es, struct ilist *l,
			 struct entry *old, struct entry *e)
{
	struct entry *prev = l_prev(es, old);

	if (!prev)
		l_add_head(es, l, e);

	else {
		e->prev = old->prev;
		e->next = to_index(es, old);
		prev->next = old->prev = to_index(es, e);

		if (!e->sentinel)
			l->nr_elts++;
	}
}

static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
{
	struct entry *prev = l_prev(es, e);
	struct entry *next = l_next(es, e);

	if (prev)
		prev->next = e->next;
	else
		l->head = e->next;

	if (next)
		next->prev = e->prev;
	else
		l->tail = e->prev;

	if (!e->sentinel)
		l->nr_elts--;
}

static struct entry *l_pop_head(struct entry_space *es, struct ilist *l)
{
	struct entry *e;

	for (e = l_head(es, l); e; e = l_next(es, e))
		if (!e->sentinel) {
			l_del(es, l, e);
			return e;
		}

	return NULL;
}

static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
{
	struct entry *e;

	for (e = l_tail(es, l); e; e = l_prev(es, e))
		if (!e->sentinel) {
			l_del(es, l, e);
			return e;
		}

	return NULL;
}

/*----------------------------------------------------------------*/

/*
 * The stochastic-multi-queue is a set of lru lists stacked into levels.
 * Entries are moved up levels when they are used, which loosely orders the
 * most accessed entries in the top levels and least in the bottom. This
 * structure is *much* better than a single lru list.
 */
#define MAX_LEVELS 64u

struct queue {
	struct entry_space *es;

	unsigned int nr_elts;
	unsigned int nr_levels;
	struct ilist qs[MAX_LEVELS];

	/*
	 * We maintain a count of the number of entries we would like in each
	 * level.
	 */
	unsigned int last_target_nr_elts;
	unsigned int nr_top_levels;
	unsigned int nr_in_top_levels;
	unsigned int target_count[MAX_LEVELS];
};

static void q_init(struct queue *q, struct entry_space *es, unsigned int nr_levels)
{
	unsigned int i;

	q->es = es;
	q->nr_elts = 0;
	q->nr_levels = nr_levels;

	for (i = 0; i < q->nr_levels; i++) {
		l_init(q->qs + i);
		q->target_count[i] = 0u;
	}

	q->last_target_nr_elts = 0u;
	q->nr_top_levels = 0u;
	q->nr_in_top_levels = 0u;
}

static unsigned int q_size(struct queue *q)
{
	return q->nr_elts;
}

/*
 * Insert an entry to the back of the given level.
 */
static void q_push(struct queue *q, struct entry *e)
{
	BUG_ON(e->pending_work);

	if (!e->sentinel)
		q->nr_elts++;

	l_add_tail(q->es, q->qs + e->level, e);
}

static void q_push_front(struct queue *q, struct entry *e)
{
	BUG_ON(e->pending_work);

	if (!e->sentinel)
		q->nr_elts++;

	l_add_head(q->es, q->qs + e->level, e);
}

static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
{
	BUG_ON(e->pending_work);

	if (!e->sentinel)
		q->nr_elts++;

	l_add_before(q->es, q->qs + e->level, old, e);
}

static void q_del(struct queue *q, struct entry *e)
{
	l_del(q->es, q->qs + e->level, e);
	if (!e->sentinel)
		q->nr_elts--;
}

/*
 * Return the oldest entry of the lowest populated level.
 */
static struct entry *q_peek(struct queue *q, unsigned int max_level, bool can_cross_sentinel)
{
	unsigned int level;
	struct entry *e;

	max_level = min(max_level, q->nr_levels);

	for (level = 0; level < max_level; level++)
		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
			if (e->sentinel) {
				if (can_cross_sentinel)
					continue;
				else
					break;
			}

			return e;
		}

	return NULL;
}

static struct entry *q_pop(struct queue *q)
{
	struct entry *e = q_peek(q, q->nr_levels, true);

	if (e)
		q_del(q, e);

	return e;
}

/*
 * This function assumes there is a non-sentinel entry to pop. It's only
 * used by redistribute, so we know this is true. It also doesn't adjust
 * the q->nr_elts count.
 */
static struct entry *__redist_pop_from(struct queue *q, unsigned int level)
{
	struct entry *e;

	for (; level < q->nr_levels; level++)
		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
			if (!e->sentinel) {
				l_del(q->es, q->qs + e->level, e);
				return e;
			}

	return NULL;
}

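/*
 * Split nr_elts across the levels in [lbegin, lend) as evenly as
 * possible; any remainder goes to the lower levels, one extra entry each.
 */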
static void q_set_targets_subrange_(struct queue *q, unsigned int nr_elts,
				    unsigned int lbegin, unsigned int lend)
{
	unsigned int level, nr_levels, entries_per_level, remainder;

	BUG_ON(lbegin > lend);
	BUG_ON(lend > q->nr_levels);
	nr_levels = lend - lbegin;
	entries_per_level = safe_div(nr_elts, nr_levels);
	remainder = safe_mod(nr_elts, nr_levels);

	for (level = lbegin; level < lend; level++)
		q->target_count[level] =
			(level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
}

/*
 * Typically we have fewer elements in the top few levels which allows us
 * to adjust the promote threshold nicely.
 */
static void q_set_targets(struct queue *q)
{
	if (q->last_target_nr_elts == q->nr_elts)
		return;

	q->last_target_nr_elts = q->nr_elts;

	if (q->nr_top_levels > q->nr_levels)
		q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);

	else {
		q_set_targets_subrange_(q, q->nr_in_top_levels,
					q->nr_levels - q->nr_top_levels, q->nr_levels);

		if (q->nr_in_top_levels < q->nr_elts)
			q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
						0, q->nr_levels - q->nr_top_levels);
		else
			q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
	}
}

static void q_redistribute(struct queue *q)
{
	unsigned int target, level;
	struct ilist *l, *l_above;
	struct entry *e;

	q_set_targets(q);

	for (level = 0u; level < q->nr_levels - 1u; level++) {
		l = q->qs + level;
		target = q->target_count[level];

		/*
		 * Pull down some entries from the level above.
		 */
		while (l->nr_elts < target) {
			e = __redist_pop_from(q, level + 1u);
			if (!e) {
				/* bug in nr_elts */
				break;
			}

			e->level = level;
			l_add_tail(q->es, l, e);
		}

		/*
		 * Push some entries up.
		 */
		l_above = q->qs + level + 1u;
		while (l->nr_elts > target) {
			e = l_pop_tail(q->es, l);

			if (!e)
				/* bug in nr_elts */
				break;

			e->level = level + 1u;
			l_add_tail(q->es, l_above, e);
		}
	}
}

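/*
 * Requeue 'e' up to 'extra_levels' higher.  To keep the level sizes
 * stable an entry from the destination level is demoted in exchange; for
 * the dirty queue the s1/s2 writeback sentinels are used so the displaced
 * entry keeps its position relative to the current writeback sweep.
 */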
static void q_requeue(struct queue *q, struct entry *e, unsigned int extra_levels,
		      struct entry *s1, struct entry *s2)
{
	struct entry *de;
	unsigned int sentinels_passed = 0;
	unsigned int new_level = min(q->nr_levels - 1u, e->level + extra_levels);

	/* try and find an entry to swap with */
	if (extra_levels && (e->level < q->nr_levels - 1u)) {
		for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de))
			sentinels_passed++;

		if (de) {
			q_del(q, de);
			de->level = e->level;
			if (s1) {
				switch (sentinels_passed) {
				case 0:
					q_push_before(q, s1, de);
					break;

				case 1:
					q_push_before(q, s2, de);
					break;

				default:
					q_push(q, de);
				}
			} else
				q_push(q, de);
		}
	}

	q_del(q, e);
	e->level = new_level;
	q_push(q, e);
}

/*----------------------------------------------------------------*/

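/*
 * Hit ratios below are held as 8-bit fixed-point fractions: a value of
 * 1 << FP_SHIFT represents 1.0, so SIXTEENTH and EIGHTH are 1/16 and 1/8.
 */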
#define FP_SHIFT 8
#define SIXTEENTH (1u << (FP_SHIFT - 4u))
#define EIGHTH (1u << (FP_SHIFT - 3u))

struct stats {
	unsigned int hit_threshold;
	unsigned int hits;
	unsigned int misses;
};

enum performance {
	Q_POOR,
	Q_FAIR,
	Q_WELL
};

static void stats_init(struct stats *s, unsigned int nr_levels)
{
	s->hit_threshold = (nr_levels * 3u) / 4u;
	s->hits = 0u;
	s->misses = 0u;
}

static void stats_reset(struct stats *s)
{
	s->hits = s->misses = 0u;
}

static void stats_level_accessed(struct stats *s, unsigned int level)
{
	if (level >= s->hit_threshold)
		s->hits++;
	else
		s->misses++;
}

static void stats_miss(struct stats *s)
{
	s->misses++;
}

/*
 * There are times when we don't have any confidence in the hotspot queue.
 * Such as when a fresh cache is created and the blocks have been spread
 * out across the levels, or if an io load changes. We detect this by
 * seeing how often a lookup is in the top levels of the hotspot queue.
 */
static enum performance stats_assess(struct stats *s)
{
	unsigned int confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);

	if (confidence < SIXTEENTH)
		return Q_POOR;

	else if (confidence < EIGHTH)
		return Q_FAIR;

	else
		return Q_WELL;
}

/*----------------------------------------------------------------*/

struct smq_hash_table {
	struct entry_space *es;
	unsigned long long hash_bits;
	unsigned int *buckets;
};

/*
 * All cache entries are stored in a chained hash table. To save space we
 * use indexing again, and only store indexes to the next entry.
 */
static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned int nr_entries)
{
	unsigned int i, nr_buckets;

	ht->es = es;
	nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
	ht->hash_bits = __ffs(nr_buckets);

	ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets)));
	if (!ht->buckets)
		return -ENOMEM;

	for (i = 0; i < nr_buckets; i++)
		ht->buckets[i] = INDEXER_NULL;

	return 0;
}

static void h_exit(struct smq_hash_table *ht)
{
	vfree(ht->buckets);
}

static struct entry *h_head(struct smq_hash_table *ht, unsigned int bucket)
{
	return to_entry(ht->es, ht->buckets[bucket]);
}

static struct entry *h_next(struct smq_hash_table *ht, struct entry *e)
{
	return to_entry(ht->es, e->hash_next);
}

static void __h_insert(struct smq_hash_table *ht, unsigned int bucket, struct entry *e)
{
	e->hash_next = ht->buckets[bucket];
	ht->buckets[bucket] = to_index(ht->es, e);
}

static void h_insert(struct smq_hash_table *ht, struct entry *e)
{
	unsigned int h = hash_64(from_oblock(e->oblock), ht->hash_bits);

	__h_insert(ht, h, e);
}

static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned int h, dm_oblock_t oblock,
				struct entry **prev)
{
	struct entry *e;

	*prev = NULL;
	for (e = h_head(ht, h); e; e = h_next(ht, e)) {
		if (e->oblock == oblock)
			return e;

		*prev = e;
	}

	return NULL;
}

static void __h_unlink(struct smq_hash_table *ht, unsigned int h,
		       struct entry *e, struct entry *prev)
{
	if (prev)
		prev->hash_next = e->hash_next;
	else
		ht->buckets[h] = e->hash_next;
}

/*
 * Also moves each entry to the front of the bucket.
 */
static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock)
{
	struct entry *e, *prev;
	unsigned int h = hash_64(from_oblock(oblock), ht->hash_bits);

	e = __h_lookup(ht, h, oblock, &prev);
	if (e && prev) {
		/*
		 * Move to the front because this entry is likely
		 * to be hit again.
		 */
		__h_unlink(ht, h, e, prev);
		__h_insert(ht, h, e);
	}

	return e;
}

static void h_remove(struct smq_hash_table *ht, struct entry *e)
{
	unsigned int h = hash_64(from_oblock(e->oblock), ht->hash_bits);
	struct entry *prev;

	/*
	 * The down side of using a singly linked list is we have to
	 * iterate the bucket to remove an item.
	 */
	e = __h_lookup(ht, h, e->oblock, &prev);
	if (e)
		__h_unlink(ht, h, e, prev);
}

/*----------------------------------------------------------------*/

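/*
 * Simple allocator over a contiguous sub-range [begin, end) of the
 * entry_space.  Free entries live on an ilist; alloc_particular_entry()
 * claims a specific index so a mapping can be bound to its cblock.
 */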
struct entry_alloc {
	struct entry_space *es;
	unsigned int begin;

	unsigned int nr_allocated;
	struct ilist free;
};

static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
			   unsigned int begin, unsigned int end)
{
	unsigned int i;

	ea->es = es;
	ea->nr_allocated = 0u;
	ea->begin = begin;

	l_init(&ea->free);
	for (i = begin; i != end; i++)
		l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
}

static void init_entry(struct entry *e)
{
	/*
	 * We can't memset because that would clear the hotspot and
	 * sentinel bits which remain constant.
	 */
	e->hash_next = INDEXER_NULL;
	e->next = INDEXER_NULL;
	e->prev = INDEXER_NULL;
	e->level = 0u;
	e->dirty = true;	/* FIXME: audit */
	e->allocated = true;
	e->sentinel = false;
	e->pending_work = false;
}

static struct entry *alloc_entry(struct entry_alloc *ea)
{
	struct entry *e;

	if (l_empty(&ea->free))
		return NULL;

	e = l_pop_head(ea->es, &ea->free);
	init_entry(e);
	ea->nr_allocated++;

	return e;
}

/*
 * This assumes the cblock hasn't already been allocated.
 */
static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned int i)
{
	struct entry *e = __get_entry(ea->es, ea->begin + i);

	BUG_ON(e->allocated);

	l_del(ea->es, &ea->free, e);
	init_entry(e);
	ea->nr_allocated++;

	return e;
}

static void free_entry(struct entry_alloc *ea, struct entry *e)
{
	BUG_ON(!ea->nr_allocated);
	BUG_ON(!e->allocated);

	ea->nr_allocated--;
	e->allocated = false;
	l_add_tail(ea->es, &ea->free, e);
}

static bool allocator_empty(struct entry_alloc *ea)
{
	return l_empty(&ea->free);
}

static unsigned int get_index(struct entry_alloc *ea, struct entry *e)
{
	return to_index(ea->es, e) - ea->begin;
}

static struct entry *get_entry(struct entry_alloc *ea, unsigned int index)
{
	return __get_entry(ea->es, ea->begin + index);
}

/*----------------------------------------------------------------*/

#define NR_HOTSPOT_LEVELS 64u
#define NR_CACHE_LEVELS 64u

#define WRITEBACK_PERIOD (10ul * HZ)
#define DEMOTE_PERIOD (60ul * HZ)

#define HOTSPOT_UPDATE_PERIOD (HZ)
#define CACHE_UPDATE_PERIOD (60ul * HZ)

struct smq_policy {
	struct dm_cache_policy policy;

	/* protects everything */
	spinlock_t lock;
	dm_cblock_t cache_size;
	sector_t cache_block_size;

	sector_t hotspot_block_size;
	unsigned int nr_hotspot_blocks;
	unsigned int cache_blocks_per_hotspot_block;
	unsigned int hotspot_level_jump;

	struct entry_space es;
	struct entry_alloc writeback_sentinel_alloc;
	struct entry_alloc demote_sentinel_alloc;
	struct entry_alloc hotspot_alloc;
	struct entry_alloc cache_alloc;

	unsigned long *hotspot_hit_bits;
	unsigned long *cache_hit_bits;

	/*
	 * We maintain three queues of entries. The cache proper,
	 * consisting of a clean and dirty queue, containing the currently
	 * active mappings. The hotspot queue uses a larger block size to
	 * track blocks that are being hit frequently and potential
	 * candidates for promotion to the cache.
	 */
	struct queue hotspot;
	struct queue clean;
	struct queue dirty;

	struct stats hotspot_stats;
	struct stats cache_stats;

	/*
	 * Keeps track of time, incremented by the core. We use this to
	 * avoid attributing multiple hits within the same tick.
	 */
	unsigned int tick;

	/*
	 * The hash tables allow us to quickly find an entry by origin
	 * block.
	 */
	struct smq_hash_table table;
	struct smq_hash_table hotspot_table;

	bool current_writeback_sentinels;
	unsigned long next_writeback_period;

	bool current_demote_sentinels;
	unsigned long next_demote_period;

	unsigned int write_promote_level;
	unsigned int read_promote_level;

	unsigned long next_hotspot_period;
	unsigned long next_cache_period;

	struct background_tracker *bg_work;

	bool migrations_allowed:1;

	/*
	 * If this is set the policy will try and clean the whole cache
	 * even if the device is not idle.
	 */
	bool cleaner:1;
};

/*----------------------------------------------------------------*/

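/*
 * Each sentinel allocator holds two banks of NR_CACHE_LEVELS sentinels;
 * 'which' selects a bank.  update_sentinels() flips the current bank each
 * period and re-pushes its sentinels to the back of their levels.
 */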
static struct entry *get_sentinel(struct entry_alloc *ea, unsigned int level, bool which)
{
	return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
}

static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned int level)
{
	return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
}

static struct entry *demote_sentinel(struct smq_policy *mq, unsigned int level)
{
	return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
}

static void __update_writeback_sentinels(struct smq_policy *mq)
{
	unsigned int level;
	struct queue *q = &mq->dirty;
	struct entry *sentinel;

	for (level = 0; level < q->nr_levels; level++) {
		sentinel = writeback_sentinel(mq, level);
		q_del(q, sentinel);
		q_push(q, sentinel);
	}
}

static void __update_demote_sentinels(struct smq_policy *mq)
{
	unsigned int level;
	struct queue *q = &mq->clean;
	struct entry *sentinel;

	for (level = 0; level < q->nr_levels; level++) {
		sentinel = demote_sentinel(mq, level);
		q_del(q, sentinel);
		q_push(q, sentinel);
	}
}

static void update_sentinels(struct smq_policy *mq)
{
	if (time_after(jiffies, mq->next_writeback_period)) {
		mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
		mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
		__update_writeback_sentinels(mq);
	}

	if (time_after(jiffies, mq->next_demote_period)) {
		mq->next_demote_period = jiffies + DEMOTE_PERIOD;
		mq->current_demote_sentinels = !mq->current_demote_sentinels;
		__update_demote_sentinels(mq);
	}
}

static void __sentinels_init(struct smq_policy *mq)
{
	unsigned int level;
	struct entry *sentinel;

	for (level = 0; level < NR_CACHE_LEVELS; level++) {
		sentinel = writeback_sentinel(mq, level);
		sentinel->level = level;
		q_push(&mq->dirty, sentinel);

		sentinel = demote_sentinel(mq, level);
		sentinel->level = level;
		q_push(&mq->clean, sentinel);
	}
}

static void sentinels_init(struct smq_policy *mq)
{
	mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
	mq->next_demote_period = jiffies + DEMOTE_PERIOD;

	mq->current_writeback_sentinels = false;
	mq->current_demote_sentinels = false;
	__sentinels_init(mq);

	mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
	mq->current_demote_sentinels = !mq->current_demote_sentinels;
	__sentinels_init(mq);
}

/*----------------------------------------------------------------*/

static void del_queue(struct smq_policy *mq, struct entry *e)
{
	q_del(e->dirty ? &mq->dirty : &mq->clean, e);
}

static void push_queue(struct smq_policy *mq, struct entry *e)
{
	if (e->dirty)
		q_push(&mq->dirty, e);
	else
		q_push(&mq->clean, e);
}

// !h, !q, a -> h, q, a
static void push(struct smq_policy *mq, struct entry *e)
{
	h_insert(&mq->table, e);
	if (!e->pending_work)
		push_queue(mq, e);
}

static void push_queue_front(struct smq_policy *mq, struct entry *e)
{
	if (e->dirty)
		q_push_front(&mq->dirty, e);
	else
		q_push_front(&mq->clean, e);
}

static void push_front(struct smq_policy *mq, struct entry *e)
{
	h_insert(&mq->table, e);
	if (!e->pending_work)
		push_queue_front(mq, e);
}

static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
{
	return to_cblock(get_index(&mq->cache_alloc, e));
}

static void requeue(struct smq_policy *mq, struct entry *e)
{
	/*
	 * Pending work has temporarily been taken out of the queues.
	 */
	if (e->pending_work)
		return;

	if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
		if (!e->dirty) {
			q_requeue(&mq->clean, e, 1u, NULL, NULL);
			return;
		}

		q_requeue(&mq->dirty, e, 1u,
			  get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels),
			  get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels));
	}
}

static unsigned int default_promote_level(struct smq_policy *mq)
{
	/*
	 * The promote level depends on the current performance of the
	 * cache.
	 *
	 * If the cache is performing badly, then we can't afford
	 * to promote much without causing performance to drop below that
	 * of the origin device.
	 *
	 * If the cache is performing well, then we don't need to promote
	 * much. If it isn't broken, don't fix it.
	 *
	 * If the cache is middling then we promote more.
	 *
	 * This scheme reminds me of a graph of entropy vs probability of a
	 * binary variable.
	 */
	static const unsigned int table[] = {
		1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1
	};

	unsigned int hits = mq->cache_stats.hits;
	unsigned int misses = mq->cache_stats.misses;
	unsigned int index = safe_div(hits << 4u, hits + misses);

	return table[index];
}

static void update_promote_levels(struct smq_policy *mq)
{
	/*
	 * If there are unused cache entries then we want to be really
	 * eager to promote.
	 */
	unsigned int threshold_level = allocator_empty(&mq->cache_alloc) ?
		default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);

	threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);

	/*
	 * If the hotspot queue is performing badly then we have little
	 * confidence that we know which blocks to promote. So we cut down
	 * the amount of promotions.
	 */
	switch (stats_assess(&mq->hotspot_stats)) {
	case Q_POOR:
		threshold_level /= 4u;
		break;

	case Q_FAIR:
		threshold_level /= 2u;
		break;

	case Q_WELL:
		break;
	}

	mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
	mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
}

/*
 * If the hotspot queue is performing badly, then we try and move entries
 * around more quickly.
 */
static void update_level_jump(struct smq_policy *mq)
{
	switch (stats_assess(&mq->hotspot_stats)) {
	case Q_POOR:
		mq->hotspot_level_jump = 4u;
		break;

	case Q_FAIR:
		mq->hotspot_level_jump = 2u;
		break;

	case Q_WELL:
		mq->hotspot_level_jump = 1u;
		break;
	}
}

static void end_hotspot_period(struct smq_policy *mq)
{
	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
	update_promote_levels(mq);

	if (time_after(jiffies, mq->next_hotspot_period)) {
		update_level_jump(mq);
		q_redistribute(&mq->hotspot);
		stats_reset(&mq->hotspot_stats);
		mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
	}
}

static void end_cache_period(struct smq_policy *mq)
{
	if (time_after(jiffies, mq->next_cache_period)) {
		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));

		q_redistribute(&mq->dirty);
		q_redistribute(&mq->clean);
		stats_reset(&mq->cache_stats);

		mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
	}
}

/*----------------------------------------------------------------*/

/*
 * Targets are given as a percentage.
 */
#define CLEAN_TARGET 25u
#define FREE_TARGET 25u

static unsigned int percent_to_target(struct smq_policy *mq, unsigned int p)
{
	return from_cblock(mq->cache_size) * p / 100u;
}

static bool clean_target_met(struct smq_policy *mq, bool idle)
{
	/*
	 * Cache entries may not be populated. So we cannot rely on the
	 * size of the clean queue.
	 */
	if (idle || mq->cleaner) {
		/*
		 * We'd like to clean everything.
		 */
		return q_size(&mq->dirty) == 0u;
	}

	/*
	 * If we're busy we don't worry about cleaning at all.
	 */
	return true;
}

static bool free_target_met(struct smq_policy *mq)
{
	unsigned int nr_free;

	nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated;
	return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
		percent_to_target(mq, FREE_TARGET);
}

/*----------------------------------------------------------------*/

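/*
 * Entries with pending_work set are owned by a queued background
 * operation and are kept out of the clean/dirty queues until that work
 * completes or is aborted.
 */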
static void mark_pending(struct smq_policy *mq, struct entry *e)
{
	BUG_ON(e->sentinel);
	BUG_ON(!e->allocated);
	BUG_ON(e->pending_work);
	e->pending_work = true;
}

static void clear_pending(struct smq_policy *mq, struct entry *e)
{
	BUG_ON(!e->pending_work);
	e->pending_work = false;
}

static void queue_writeback(struct smq_policy *mq, bool idle)
{
	int r;
	struct policy_work work;
	struct entry *e;

	e = q_peek(&mq->dirty, mq->dirty.nr_levels, idle);
	if (e) {
		mark_pending(mq, e);
		q_del(&mq->dirty, e);

		work.op = POLICY_WRITEBACK;
		work.oblock = e->oblock;
		work.cblock = infer_cblock(mq, e);

		r = btracker_queue(mq->bg_work, &work, NULL);
		if (r) {
			clear_pending(mq, e);
			q_push_front(&mq->dirty, e);
		}
	}
}

static void queue_demotion(struct smq_policy *mq)
{
	int r;
	struct policy_work work;
	struct entry *e;

	if (WARN_ON_ONCE(!mq->migrations_allowed))
		return;

	e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
	if (!e) {
		if (!clean_target_met(mq, true))
			queue_writeback(mq, false);
		return;
	}

	mark_pending(mq, e);
	q_del(&mq->clean, e);

	work.op = POLICY_DEMOTE;
	work.oblock = e->oblock;
	work.cblock = infer_cblock(mq, e);
	r = btracker_queue(mq->bg_work, &work, NULL);
	if (r) {
		clear_pending(mq, e);
		q_push_front(&mq->clean, e);
	}
}

static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
			    struct policy_work **workp)
{
	int r;
	struct entry *e;
	struct policy_work work;

	if (!mq->migrations_allowed)
		return;

	if (allocator_empty(&mq->cache_alloc)) {
		/*
		 * We always claim to be 'idle' to ensure some demotions happen
		 * with continuous loads.
		 */
		if (!free_target_met(mq))
			queue_demotion(mq);
		return;
	}

	if (btracker_promotion_already_present(mq->bg_work, oblock))
		return;

	/*
	 * We allocate the entry now to reserve the cblock. If the
	 * background work is aborted we must remember to free it.
	 */
	e = alloc_entry(&mq->cache_alloc);
	BUG_ON(!e);
	e->pending_work = true;
	work.op = POLICY_PROMOTE;
	work.oblock = oblock;
	work.cblock = infer_cblock(mq, e);
	r = btracker_queue(mq->bg_work, &work, workp);
	if (r)
		free_entry(&mq->cache_alloc, e);
}

/*----------------------------------------------------------------*/

enum promote_result {
	PROMOTE_NOT,
	PROMOTE_TEMPORARY,
	PROMOTE_PERMANENT
};

/*
 * Converts a boolean into a promote result.
 */
static enum promote_result maybe_promote(bool promote)
{
	return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
}

static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e,
					  int data_dir, bool fast_promote)
{
	if (data_dir == WRITE) {
		if (!allocator_empty(&mq->cache_alloc) && fast_promote)
			return PROMOTE_TEMPORARY;

		return maybe_promote(hs_e->level >= mq->write_promote_level);
	} else
		return maybe_promote(hs_e->level >= mq->read_promote_level);
}

static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
{
	sector_t r = from_oblock(b);
	(void) sector_div(r, mq->cache_blocks_per_hotspot_block);
	return to_oblock(r);
}

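/*
 * Look up the hotspot entry covering origin block 'b', promoting it by
 * hotspot_level_jump on the first hit since the hit bits were last
 * cleared.  On a miss a hotspot entry is allocated for the block,
 * recycling the coldest existing entry if the allocator is empty.
 */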
static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b)
{
	unsigned int hi;
	dm_oblock_t hb = to_hblock(mq, b);
	struct entry *e = h_lookup(&mq->hotspot_table, hb);

	if (e) {
		stats_level_accessed(&mq->hotspot_stats, e->level);

		hi = get_index(&mq->hotspot_alloc, e);
		q_requeue(&mq->hotspot, e,
			  test_and_set_bit(hi, mq->hotspot_hit_bits) ?
			  0u : mq->hotspot_level_jump,
			  NULL, NULL);

	} else {
		stats_miss(&mq->hotspot_stats);

		e = alloc_entry(&mq->hotspot_alloc);
		if (!e) {
			e = q_pop(&mq->hotspot);
			if (e) {
				h_remove(&mq->hotspot_table, e);
				hi = get_index(&mq->hotspot_alloc, e);
				clear_bit(hi, mq->hotspot_hit_bits);
			}
		}

		if (e) {
			e->oblock = hb;
			q_push(&mq->hotspot, e);
			h_insert(&mq->hotspot_table, e);
		}
	}

	return e;
}

/*----------------------------------------------------------------*/

/*
 * Public interface, via the policy struct. See dm-cache-policy.h for a
 * description of these.
 */

static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
{
	return container_of(p, struct smq_policy, policy);
}

static void smq_destroy(struct dm_cache_policy *p)
{
	struct smq_policy *mq = to_smq_policy(p);

	btracker_destroy(mq->bg_work);
	h_exit(&mq->hotspot_table);
	h_exit(&mq->table);
	free_bitset(mq->hotspot_hit_bits);
	free_bitset(mq->cache_hit_bits);
	space_exit(&mq->es);
	kfree(mq);
}

/*----------------------------------------------------------------*/

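/*
 * Core lookup used by both smq_lookup() and smq_lookup_with_work().
 * Returns 0 and fills *cblock on a cache hit; returns -ENOENT on a miss,
 * possibly queueing a promotion and setting *background_work.
 */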
static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock,
		    int data_dir, bool fast_copy,
		    struct policy_work **work, bool *background_work)
{
	struct entry *e, *hs_e;
	enum promote_result pr;

	*background_work = false;

	e = h_lookup(&mq->table, oblock);
	if (e) {
		stats_level_accessed(&mq->cache_stats, e->level);

		requeue(mq, e);
		*cblock = infer_cblock(mq, e);
		return 0;

	} else {
		stats_miss(&mq->cache_stats);

		/*
		 * The hotspot queue only gets updated with misses.
		 */
		hs_e = update_hotspot_queue(mq, oblock);

		pr = should_promote(mq, hs_e, data_dir, fast_copy);
		if (pr != PROMOTE_NOT) {
			queue_promotion(mq, oblock, work);
			*background_work = true;
		}

		return -ENOENT;
	}
}

static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
		      int data_dir, bool fast_copy,
		      bool *background_work)
{
	int r;
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	r = __lookup(mq, oblock, cblock,
		     data_dir, fast_copy,
		     NULL, background_work);
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}

static int smq_lookup_with_work(struct dm_cache_policy *p,
				dm_oblock_t oblock, dm_cblock_t *cblock,
				int data_dir, bool fast_copy,
				struct policy_work **work)
{
	int r;
	bool background_queued;
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued);
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}

static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
				   struct policy_work **result)
{
	int r;
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	r = btracker_issue(mq->bg_work, result);
	if (r == -ENODATA) {
		if (!clean_target_met(mq, idle)) {
			queue_writeback(mq, idle);
			r = btracker_issue(mq->bg_work, result);
		}
	}
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}

/*
 * We need to clear any pending work flags that have been set, and in the
 * case of promotion free the entry for the destination cblock.
 */
static void __complete_background_work(struct smq_policy *mq,
				       struct policy_work *work,
				       bool success)
{
	struct entry *e = get_entry(&mq->cache_alloc,
				    from_cblock(work->cblock));

	switch (work->op) {
	case POLICY_PROMOTE:
		// !h, !q, a
		clear_pending(mq, e);
		if (success) {
			e->oblock = work->oblock;
			e->level = NR_CACHE_LEVELS - 1;
			push(mq, e);
			// h, q, a
		} else {
			free_entry(&mq->cache_alloc, e);
			// !h, !q, !a
		}
		break;

	case POLICY_DEMOTE:
		// h, !q, a
		if (success) {
			h_remove(&mq->table, e);
			free_entry(&mq->cache_alloc, e);
			// !h, !q, !a
		} else {
			clear_pending(mq, e);
			push_queue(mq, e);
			// h, q, a
		}
		break;

	case POLICY_WRITEBACK:
		// h, !q, a
		clear_pending(mq, e);
		push_queue(mq, e);
		// h, q, a
		break;
	}

	btracker_complete(mq->bg_work, work);
}

static void smq_complete_background_work(struct dm_cache_policy *p,
					 struct policy_work *work,
					 bool success)
{
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	__complete_background_work(mq, work, success);
	spin_unlock_irqrestore(&mq->lock, flags);
}

// in_hash(oblock) -> in_hash(oblock)
static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
{
	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));

	if (e->pending_work)
		e->dirty = set;
	else {
		del_queue(mq, e);
		e->dirty = set;
		push_queue(mq, e);
	}
}

static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	__smq_set_clear_dirty(mq, cblock, true);
	spin_unlock_irqrestore(&mq->lock, flags);
}

static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	struct smq_policy *mq = to_smq_policy(p);
	unsigned long flags;

	spin_lock_irqsave(&mq->lock, flags);
	__smq_set_clear_dirty(mq, cblock, false);
	spin_unlock_irqrestore(&mq->lock, flags);
}

static unsigned int random_level(dm_cblock_t cblock)
{
	return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
}

static int smq_load_mapping(struct dm_cache_policy *p,
			    dm_oblock_t oblock, dm_cblock_t cblock,
			    bool dirty, uint32_t hint, bool hint_valid)
{
	struct smq_policy *mq = to_smq_policy(p);
	struct entry *e;

	e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
	e->oblock = oblock;
	e->dirty = dirty;
	e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
	e->pending_work = false;

	/*
	 * When we load mappings we push ahead of both sentinels in order to
	 * allow demotions and cleaning to occur immediately.
	 */
	push_front(mq, e);

	return 0;
}

static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	struct smq_policy *mq = to_smq_policy(p);
	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));

	if (!e->allocated)
		return -ENODATA;

	// FIXME: what if this block has pending background work?
	del_queue(mq, e);
	h_remove(&mq->table, e);
	free_entry(&mq->cache_alloc, e);
	return 0;
}

static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	struct smq_policy *mq = to_smq_policy(p);
	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));

	if (!e->allocated)
		return 0;

	return e->level;
}

static dm_cblock_t smq_residency(struct dm_cache_policy *p)
{
	dm_cblock_t r;
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	r = to_cblock(mq->cache_alloc.nr_allocated);
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}

static void smq_tick(struct dm_cache_policy *p, bool can_block)
{
	struct smq_policy *mq = to_smq_policy(p);
	unsigned long flags;

	spin_lock_irqsave(&mq->lock, flags);
	mq->tick++;
	update_sentinels(mq);
	end_hotspot_period(mq);
	end_cache_period(mq);
	spin_unlock_irqrestore(&mq->lock, flags);
}

static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
{
	struct smq_policy *mq = to_smq_policy(p);

	mq->migrations_allowed = allow;
}

/*
 * smq has no config values, but the old mq policy did. To avoid breaking
 * software we continue to accept these configurables for the mq policy,
 * but they have no effect.
 */
static int mq_set_config_value(struct dm_cache_policy *p,
			       const char *key, const char *value)
{
	unsigned long tmp;

	if (kstrtoul(value, 10, &tmp))
		return -EINVAL;

	if (!strcasecmp(key, "random_threshold") ||
	    !strcasecmp(key, "sequential_threshold") ||
	    !strcasecmp(key, "discard_promote_adjustment") ||
	    !strcasecmp(key, "read_promote_adjustment") ||
	    !strcasecmp(key, "write_promote_adjustment")) {
		DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key);
		return 0;
	}

	return -EINVAL;
}

static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
				 unsigned int maxlen, ssize_t *sz_ptr)
{
	ssize_t sz = *sz_ptr;

	DMEMIT("10 random_threshold 0 "
	       "sequential_threshold 0 "
	       "discard_promote_adjustment 0 "
	       "read_promote_adjustment 0 "
	       "write_promote_adjustment 0 ");

	*sz_ptr = sz;
	return 0;
}

/* Init the policy plugin interface function pointers. */
static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
{
	mq->policy.destroy = smq_destroy;
	mq->policy.lookup = smq_lookup;
	mq->policy.lookup_with_work = smq_lookup_with_work;
	mq->policy.get_background_work = smq_get_background_work;
	mq->policy.complete_background_work = smq_complete_background_work;
	mq->policy.set_dirty = smq_set_dirty;
	mq->policy.clear_dirty = smq_clear_dirty;
	mq->policy.load_mapping = smq_load_mapping;
	mq->policy.invalidate_mapping = smq_invalidate_mapping;
	mq->policy.get_hint = smq_get_hint;
	mq->policy.residency = smq_residency;
	mq->policy.tick = smq_tick;
	mq->policy.allow_migrations = smq_allow_migrations;

	if (mimic_mq) {
		mq->policy.set_config_value = mq_set_config_value;
		mq->policy.emit_config_values = mq_emit_config_values;
	}
}

static bool too_many_hotspot_blocks(sector_t origin_size,
				    sector_t hotspot_block_size,
				    unsigned int nr_hotspot_blocks)
{
	return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
}

static void calc_hotspot_params(sector_t origin_size,
				sector_t cache_block_size,
				unsigned int nr_cache_blocks,
				sector_t *hotspot_block_size,
				unsigned int *nr_hotspot_blocks)
{
	*hotspot_block_size = cache_block_size * 16u;
	*nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);

	while ((*hotspot_block_size > cache_block_size) &&
	       too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
		*hotspot_block_size /= 2u;
}

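/*
 * The single entry_space is laid out as: writeback sentinels, demote
 * sentinels, hotspot entries, then one entry per cache block; each
 * allocator below covers its own sub-range.
 */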
static struct dm_cache_policy *
__smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size,
	     bool mimic_mq, bool migrations_allowed, bool cleaner)
{
	unsigned int i;
	unsigned int nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
	unsigned int total_sentinels = 2u * nr_sentinels_per_queue;
	struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);

	if (!mq)
		return NULL;

	init_policy_functions(mq, mimic_mq);
	mq->cache_size = cache_size;
	mq->cache_block_size = cache_block_size;

	calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
			    &mq->hotspot_block_size, &mq->nr_hotspot_blocks);

	mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
	mq->hotspot_level_jump = 1u;
	if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
		DMERR("couldn't initialize entry space");
		goto bad_pool_init;
	}

	init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
	for (i = 0; i < nr_sentinels_per_queue; i++)
		get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;

	init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
	for (i = 0; i < nr_sentinels_per_queue; i++)
		get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;

	init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
		       total_sentinels + mq->nr_hotspot_blocks);

	init_allocator(&mq->cache_alloc, &mq->es,
		       total_sentinels + mq->nr_hotspot_blocks,
		       total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));

	mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
	if (!mq->hotspot_hit_bits) {
		DMERR("couldn't allocate hotspot hit bitset");
		goto bad_hotspot_hit_bits;
	}
	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);

	if (from_cblock(cache_size)) {
		mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
		if (!mq->cache_hit_bits) {
			DMERR("couldn't allocate cache hit bitset");
			goto bad_cache_hit_bits;
		}
		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
	} else
		mq->cache_hit_bits = NULL;

	mq->tick = 0;
	spin_lock_init(&mq->lock);

	q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
	mq->hotspot.nr_top_levels = 8;
	mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
					   from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);

	q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
	q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);

	stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
	stats_init(&mq->cache_stats, NR_CACHE_LEVELS);

	if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
		goto bad_alloc_table;

	if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
		goto bad_alloc_hotspot_table;

	sentinels_init(mq);
	mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;

	mq->next_hotspot_period = jiffies;
	mq->next_cache_period = jiffies;

	mq->bg_work = btracker_create(4096); /* FIXME: hard coded value */
	if (!mq->bg_work)
		goto bad_btracker;

	mq->migrations_allowed = migrations_allowed;
	mq->cleaner = cleaner;

	return &mq->policy;

bad_btracker:
	h_exit(&mq->hotspot_table);
bad_alloc_hotspot_table:
	h_exit(&mq->table);
bad_alloc_table:
	free_bitset(mq->cache_hit_bits);
bad_cache_hit_bits:
	free_bitset(mq->hotspot_hit_bits);
bad_hotspot_hit_bits:
	space_exit(&mq->es);
bad_pool_init:
	kfree(mq);
	return NULL;
}

static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
					  sector_t origin_size,
					  sector_t cache_block_size)
{
	return __smq_create(cache_size, origin_size, cache_block_size,
			    false, true, false);
}

static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
					 sector_t origin_size,
					 sector_t cache_block_size)
{
	return __smq_create(cache_size, origin_size, cache_block_size,
			    true, true, false);
}

static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
					      sector_t origin_size,
					      sector_t cache_block_size)
{
	return __smq_create(cache_size, origin_size, cache_block_size,
			    false, false, true);
}

/*----------------------------------------------------------------*/

static struct dm_cache_policy_type smq_policy_type = {
	.name = "smq",
	.version = {2, 0, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = smq_create
};

static struct dm_cache_policy_type mq_policy_type = {
	.name = "mq",
	.version = {2, 0, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = mq_create,
};

static struct dm_cache_policy_type cleaner_policy_type = {
	.name = "cleaner",
	.version = {2, 0, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = cleaner_create,
};

static struct dm_cache_policy_type default_policy_type = {
	.name = "default",
	.version = {2, 0, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = smq_create,
	.real = &smq_policy_type
};

static int __init smq_init(void)
{
	int r;

	r = dm_cache_policy_register(&smq_policy_type);
	if (r) {
		DMERR("register failed %d", r);
		return -ENOMEM;
	}

	r = dm_cache_policy_register(&mq_policy_type);
	if (r) {
		DMERR("register failed (as mq) %d", r);
		goto out_mq;
	}

	r = dm_cache_policy_register(&cleaner_policy_type);
	if (r) {
		DMERR("register failed (as cleaner) %d", r);
		goto out_cleaner;
	}

	r = dm_cache_policy_register(&default_policy_type);
	if (r) {
		DMERR("register failed (as default) %d", r);
		goto out_default;
	}

	return 0;

out_default:
	dm_cache_policy_unregister(&cleaner_policy_type);
out_cleaner:
	dm_cache_policy_unregister(&mq_policy_type);
out_mq:
	dm_cache_policy_unregister(&smq_policy_type);
	return -ENOMEM;
}

static void __exit smq_exit(void)
{
	dm_cache_policy_unregister(&cleaner_policy_type);
	dm_cache_policy_unregister(&smq_policy_type);
	dm_cache_policy_unregister(&mq_policy_type);
	dm_cache_policy_unregister(&default_policy_type);
}

module_init(smq_init);
module_exit(smq_exit);

MODULE_AUTHOR("Joe Thornber <[email protected]>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("smq cache policy");

MODULE_ALIAS("dm-cache-default");
MODULE_ALIAS("dm-cache-mq");
MODULE_ALIAS("dm-cache-cleaner");