// SPDX-License-Identifier: GPL-2.0-only
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/list_bl.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/mbcache.h>

/*
 * Mbcache is a simple key-value store. Keys need not be unique, however
 * key-value pairs are expected to be unique (we use this fact in
 * mb_cache_entry_delete_or_get()).
 *
 * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
 * Ext4 also uses it for deduplication of xattr values stored in inodes.
 * They use a hash of the data as the key and provide a value that may
 * represent a block or inode number. That's why keys need not be unique
 * (hashes of different data may be the same). However, the user-provided
 * value always uniquely identifies a cache entry.
 *
 * We provide functions for creation and removal of entries, search by key,
 * and a special "delete entry with given key-value pair" operation. A fixed
 * size hash table is used for fast key lookups.
 */
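
/*
 * Illustrative sketch, not part of mbcache itself: the typical lifecycle a
 * filesystem would go through with this API. The function name
 * example_mbcache_lifecycle() and the hash/block values are hypothetical;
 * only the mb_cache_* calls are the real interface from <linux/mbcache.h>.
 */
static inline void example_mbcache_lifecycle(void)
{
        struct mb_cache *cache = mb_cache_create(10);   /* 2^10 hash buckets */
        struct mb_cache_entry *entry;

        if (!cache)
                return;

        /* Cache block 1234 under the hash of its contents (0xabcd here). */
        if (mb_cache_entry_create(cache, GFP_NOFS, 0xabcd, 1234, true) == 0) {
                /* Later: look the block up by hash and drop our reference. */
                entry = mb_cache_entry_find_first(cache, 0xabcd);
                if (entry)
                        mb_cache_entry_put(cache, entry);
        }

        mb_cache_destroy(cache);
}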
struct mb_cache {
        /* Hash table of entries */
        struct hlist_bl_head *c_hash;
        /* log2 of hash table size */
        int c_bucket_bits;
        /* Maximum entries in cache to avoid degrading hash too much */
        unsigned long c_max_entries;
        /* Protects c_list, c_entry_count */
        spinlock_t c_list_lock;
        struct list_head c_list;
        /* Number of entries in cache */
        unsigned long c_entry_count;
        struct shrinker c_shrink;
        /* Work for shrinking when the cache has too many entries */
        struct work_struct c_shrink_work;
};

static struct kmem_cache *mb_entry_cache;

static unsigned long mb_cache_shrink(struct mb_cache *cache,
                                     unsigned long nr_to_scan);

static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
                                                        u32 key)
{
        return &cache->c_hash[hash_32(key, cache->c_bucket_bits)];
}

/*
 * Number of entries to reclaim synchronously when there are too many entries
 * in cache
 */
#define SYNC_SHRINK_BATCH 64

/*
 * mb_cache_entry_create - create entry in cache
 * @cache - cache where the entry should be created
 * @mask - gfp mask with which the entry should be allocated
 * @key - key of the entry
 * @value - value of the entry
 * @reusable - is the entry reusable by others?
 *
 * Creates entry in @cache with key @key and value @value. The function returns
 * -EBUSY if entry with the same key and value already exists in cache.
 * Otherwise 0 is returned.
 */
int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
                          u64 value, bool reusable)
{
        struct mb_cache_entry *entry, *dup;
        struct hlist_bl_node *dup_node;
        struct hlist_bl_head *head;

        /* Schedule background reclaim if there are too many entries */
        if (cache->c_entry_count >= cache->c_max_entries)
                schedule_work(&cache->c_shrink_work);
        /* Do some sync reclaim if background reclaim cannot keep up */
        if (cache->c_entry_count >= 2*cache->c_max_entries)
                mb_cache_shrink(cache, SYNC_SHRINK_BATCH);

        entry = kmem_cache_alloc(mb_entry_cache, mask);
        if (!entry)
                return -ENOMEM;

        INIT_LIST_HEAD(&entry->e_list);
        /*
         * We create entry with two references. One reference is kept by the
         * hash table, the other reference is used to protect us from
         * mb_cache_entry_delete_or_get() until the entry is fully set up. This
         * avoids nesting of cache->c_list_lock into hash table bit locks which
         * is problematic for RT.
         */
        atomic_set(&entry->e_refcnt, 2);
        entry->e_key = key;
        entry->e_value = value;
        entry->e_flags = 0;
        if (reusable)
                set_bit(MBE_REUSABLE_B, &entry->e_flags);
        head = mb_cache_entry_head(cache, key);
        hlist_bl_lock(head);
        hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
                if (dup->e_key == key && dup->e_value == value) {
                        hlist_bl_unlock(head);
                        kmem_cache_free(mb_entry_cache, entry);
                        return -EBUSY;
                }
        }
        hlist_bl_add_head(&entry->e_hash_list, head);
        hlist_bl_unlock(head);
        spin_lock(&cache->c_list_lock);
        list_add_tail(&entry->e_list, &cache->c_list);
        cache->c_entry_count++;
        spin_unlock(&cache->c_list_lock);
        mb_cache_entry_put(cache, entry);

        return 0;
}
EXPORT_SYMBOL(mb_cache_entry_create);
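
/*
 * Illustrative sketch of how a caller such as a filesystem's xattr code
 * typically treats the -EBUSY result above: an already-cached duplicate of
 * the same key-value pair is not an error for the caller. The name
 * example_cache_xattr_block() and its arguments are hypothetical.
 */
static inline int example_cache_xattr_block(struct mb_cache *cache,
                                            u32 hash, u64 block_nr)
{
        int err = mb_cache_entry_create(cache, GFP_NOFS, hash, block_nr, true);

        /* The pair is already cached; treat that as success. */
        if (err == -EBUSY)
                err = 0;
        return err;
}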
void __mb_cache_entry_free(struct mb_cache *cache, struct mb_cache_entry *entry)
{
        struct hlist_bl_head *head;

        head = mb_cache_entry_head(cache, entry->e_key);
        hlist_bl_lock(head);
        hlist_bl_del(&entry->e_hash_list);
        hlist_bl_unlock(head);
        kmem_cache_free(mb_entry_cache, entry);
}
EXPORT_SYMBOL(__mb_cache_entry_free);

/*
 * mb_cache_entry_wait_unused - wait to be the last user of the entry
 *
 * @entry - entry to work on
 *
 * Wait to be the last user of the entry.
 */
void mb_cache_entry_wait_unused(struct mb_cache_entry *entry)
{
        wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 2);
}
EXPORT_SYMBOL(mb_cache_entry_wait_unused);
static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
                                           struct mb_cache_entry *entry,
                                           u32 key)
{
        struct mb_cache_entry *old_entry = entry;
        struct hlist_bl_node *node;
        struct hlist_bl_head *head;

        head = mb_cache_entry_head(cache, key);
        hlist_bl_lock(head);
        if (entry && !hlist_bl_unhashed(&entry->e_hash_list))
                node = entry->e_hash_list.next;
        else
                node = hlist_bl_first(head);
        while (node) {
                entry = hlist_bl_entry(node, struct mb_cache_entry,
                                       e_hash_list);
                if (entry->e_key == key &&
                    test_bit(MBE_REUSABLE_B, &entry->e_flags) &&
                    atomic_inc_not_zero(&entry->e_refcnt))
                        goto out;
                node = node->next;
        }
        entry = NULL;
out:
        hlist_bl_unlock(head);
        if (old_entry)
                mb_cache_entry_put(cache, old_entry);

        return entry;
}

/*
 * mb_cache_entry_find_first - find the first reusable entry with the given key
 * @cache: cache where we should search
 * @key: key to look for
 *
 * Search in @cache for a reusable entry with key @key. Grabs reference to the
 * first reusable entry found and returns the entry.
 */
struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
                                                 u32 key)
{
        return __entry_find(cache, NULL, key);
}
EXPORT_SYMBOL(mb_cache_entry_find_first);

/*
 * mb_cache_entry_find_next - find next reusable entry with the same key
 * @cache: cache where we should search
 * @entry: entry to start search from
 *
 * Finds next reusable entry in the hash chain which has the same key as @entry.
 * If @entry is unhashed (which can happen when deletion of entry races with the
 * search), finds the first reusable entry in the hash chain. The function drops
 * reference to @entry and returns with a reference to the found entry.
 */
struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
                                                struct mb_cache_entry *entry)
{
        return __entry_find(cache, entry, entry->e_key);
}
EXPORT_SYMBOL(mb_cache_entry_find_next);
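
/*
 * Illustrative sketch: walk all reusable entries cached under @hash and pick
 * the first candidate the caller-supplied predicate accepts, roughly how a
 * filesystem scans duplicate xattr blocks. Everything except the
 * mb_cache_entry_* calls is hypothetical. Note that
 * mb_cache_entry_find_next() drops the reference to the entry passed in, so
 * only the entry we break out with must be put explicitly.
 */
static inline u64 example_pick_block(struct mb_cache *cache, u32 hash,
                                     bool (*is_usable)(u64 value))
{
        struct mb_cache_entry *entry;
        u64 block = 0;

        for (entry = mb_cache_entry_find_first(cache, hash); entry;
             entry = mb_cache_entry_find_next(cache, entry)) {
                if (is_usable(entry->e_value)) {
                        block = entry->e_value;
                        mb_cache_entry_put(cache, entry);
                        break;
                }
        }
        return block;
}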
/*
 * mb_cache_entry_get - get a cache entry by value (and key)
 * @cache - cache we work with
 * @key - key
 * @value - value
 */
struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
                                          u64 value)
{
        struct hlist_bl_node *node;
        struct hlist_bl_head *head;
        struct mb_cache_entry *entry;

        head = mb_cache_entry_head(cache, key);
        hlist_bl_lock(head);
        hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
                if (entry->e_key == key && entry->e_value == value &&
                    atomic_inc_not_zero(&entry->e_refcnt))
                        goto out;
        }
        entry = NULL;
out:
        hlist_bl_unlock(head);
        return entry;
}
EXPORT_SYMBOL(mb_cache_entry_get);

/* mb_cache_entry_delete_or_get - remove a cache entry if it has no users
 * @cache - cache we work with
 * @key - key
 * @value - value
 *
 * Remove entry from cache @cache with key @key and value @value. The removal
 * happens only if the entry is unused. The function returns NULL in case the
 * entry was successfully removed or there's no entry in cache. Otherwise the
 * function grabs a reference to the entry that we failed to delete because it
 * still has users and returns it.
 */
struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
                                                    u32 key, u64 value)
{
        struct mb_cache_entry *entry;

        entry = mb_cache_entry_get(cache, key, value);
        if (!entry)
                return NULL;

        /*
         * Drop the ref we got from mb_cache_entry_get() and the initial hash
         * ref if we are the last user
         */
        if (atomic_cmpxchg(&entry->e_refcnt, 2, 0) != 2)
                return entry;

        spin_lock(&cache->c_list_lock);
        if (!list_empty(&entry->e_list))
                list_del_init(&entry->e_list);
        cache->c_entry_count--;
        spin_unlock(&cache->c_list_lock);
        __mb_cache_entry_free(cache, entry);
        return NULL;
}
EXPORT_SYMBOL(mb_cache_entry_delete_or_get);
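
/*
 * Illustrative sketch of the intended deletion protocol, similar in spirit
 * to how ext4 releases xattr blocks: try to remove the key-value pair; if
 * somebody still holds the entry, wait until we are the last user before
 * reusing or freeing the underlying block. The name example_forget_block()
 * and its arguments are hypothetical.
 */
static inline void example_forget_block(struct mb_cache *cache, u32 hash,
                                        u64 block_nr)
{
        struct mb_cache_entry *entry;

        entry = mb_cache_entry_delete_or_get(cache, hash, block_nr);
        if (entry) {
                /* Entry is still in use; wait for the other users to finish. */
                mb_cache_entry_wait_unused(entry);
                mb_cache_entry_put(cache, entry);
        }
}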
/* mb_cache_entry_touch - cache entry got used
 * @cache - cache the entry belongs to
 * @entry - entry that got used
 *
 * Marks entry as used to give it higher chances of surviving in cache.
 */
void mb_cache_entry_touch(struct mb_cache *cache,
                          struct mb_cache_entry *entry)
{
        set_bit(MBE_REFERENCED_B, &entry->e_flags);
}
EXPORT_SYMBOL(mb_cache_entry_touch);

static unsigned long mb_cache_count(struct shrinker *shrink,
                                    struct shrink_control *sc)
{
        struct mb_cache *cache = container_of(shrink, struct mb_cache,
                                              c_shrink);

        return cache->c_entry_count;
}

/* Shrink number of entries in cache */
static unsigned long mb_cache_shrink(struct mb_cache *cache,
                                     unsigned long nr_to_scan)
{
        struct mb_cache_entry *entry;
        unsigned long shrunk = 0;

        spin_lock(&cache->c_list_lock);
        while (nr_to_scan-- && !list_empty(&cache->c_list)) {
                entry = list_first_entry(&cache->c_list,
                                         struct mb_cache_entry, e_list);
                /* Drop initial hash reference if there is no user */
                if (test_bit(MBE_REFERENCED_B, &entry->e_flags) ||
                    atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) {
                        clear_bit(MBE_REFERENCED_B, &entry->e_flags);
                        list_move_tail(&entry->e_list, &cache->c_list);
                        continue;
                }
                list_del_init(&entry->e_list);
                cache->c_entry_count--;
                spin_unlock(&cache->c_list_lock);
                __mb_cache_entry_free(cache, entry);
                shrunk++;
                cond_resched();
                spin_lock(&cache->c_list_lock);
        }
        spin_unlock(&cache->c_list_lock);

        return shrunk;
}

static unsigned long mb_cache_scan(struct shrinker *shrink,
                                   struct shrink_control *sc)
{
        struct mb_cache *cache = container_of(shrink, struct mb_cache,
                                              c_shrink);
        return mb_cache_shrink(cache, sc->nr_to_scan);
}

/* We shrink 1/X of the cache when we have too many entries in it */
#define SHRINK_DIVISOR 16

static void mb_cache_shrink_worker(struct work_struct *work)
{
        struct mb_cache *cache = container_of(work, struct mb_cache,
                                              c_shrink_work);
        mb_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR);
}

/*
 * mb_cache_create - create cache
 * @bucket_bits: log2 of the hash table size
 *
 * Create cache for keys with 2^bucket_bits hash entries.
 */
struct mb_cache *mb_cache_create(int bucket_bits)
{
        struct mb_cache *cache;
        unsigned long bucket_count = 1UL << bucket_bits;
        unsigned long i;

        cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL);
        if (!cache)
                goto err_out;
        cache->c_bucket_bits = bucket_bits;
        cache->c_max_entries = bucket_count << 4;
        INIT_LIST_HEAD(&cache->c_list);
        spin_lock_init(&cache->c_list_lock);
        cache->c_hash = kmalloc_array(bucket_count,
                                      sizeof(struct hlist_bl_head),
                                      GFP_KERNEL);
        if (!cache->c_hash) {
                kfree(cache);
                goto err_out;
        }
        for (i = 0; i < bucket_count; i++)
                INIT_HLIST_BL_HEAD(&cache->c_hash[i]);

        cache->c_shrink.count_objects = mb_cache_count;
        cache->c_shrink.scan_objects = mb_cache_scan;
        cache->c_shrink.seeks = DEFAULT_SEEKS;
        if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) {
                kfree(cache->c_hash);
                kfree(cache);
                goto err_out;
        }

        INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker);

        return cache;

err_out:
        return NULL;
}
EXPORT_SYMBOL(mb_cache_create);
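
/*
 * Sizing sketch (illustrative): with bucket_bits == 10 the cache gets
 * 1UL << 10 == 1024 hash buckets and c_max_entries == 1024 << 4 == 16384
 * entries before background reclaim is scheduled, i.e. roughly 16 entries
 * per bucket. The constant EXAMPLE_CACHE_BITS and the wrapper below are
 * hypothetical and only show how a caller would size its cache.
 */
#define EXAMPLE_CACHE_BITS 10

static inline struct mb_cache *example_create_cache(void)
{
        return mb_cache_create(EXAMPLE_CACHE_BITS);     /* NULL on failure */
}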
/*
 * mb_cache_destroy - destroy cache
 * @cache: the cache to destroy
 *
 * Free all entries in cache and cache itself. Caller must make sure nobody
 * (except shrinker) can reach @cache when calling this.
 */
void mb_cache_destroy(struct mb_cache *cache)
{
        struct mb_cache_entry *entry, *next;

        unregister_shrinker(&cache->c_shrink);

        /*
         * We don't bother with any locking. Cache must not be used at this
         * point.
         */
        list_for_each_entry_safe(entry, next, &cache->c_list, e_list) {
                list_del(&entry->e_list);
                WARN_ON(atomic_read(&entry->e_refcnt) != 1);
                mb_cache_entry_put(cache, entry);
        }
        kfree(cache->c_hash);
        kfree(cache);
}
EXPORT_SYMBOL(mb_cache_destroy);

static int __init mbcache_init(void)
{
        mb_entry_cache = kmem_cache_create("mbcache",
                                sizeof(struct mb_cache_entry), 0,
                                SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
        if (!mb_entry_cache)
                return -ENOMEM;
        return 0;
}

static void __exit mbcache_exit(void)
{
        kmem_cache_destroy(mb_entry_cache);
}

module_init(mbcache_init)
module_exit(mbcache_exit)

MODULE_AUTHOR("Jan Kara <[email protected]>");
MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
MODULE_LICENSE("GPL");