crush.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360
  1. /* SPDX-License-Identifier: GPL-2.0 */
  2. #ifndef CEPH_CRUSH_CRUSH_H
  3. #define CEPH_CRUSH_CRUSH_H
  4. #ifdef __KERNEL__
  5. # include <linux/rbtree.h>
  6. # include <linux/types.h>
  7. #else
  8. # include "crush_compat.h"
  9. #endif
  10. /*
  11. * CRUSH is a pseudo-random data distribution algorithm that
  12. * efficiently distributes input values (typically, data objects)
  13. * across a heterogeneous, structured storage cluster.
  14. *
  15. * The algorithm was originally described in detail in this paper
  16. * (although the algorithm has evolved somewhat since then):
  17. *
  18. * https://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
  19. *
  20. * LGPL2
  21. */
  22. #define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
  23. #define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
  24. #define CRUSH_MAX_RULESET (1<<8) /* max crush ruleset number */
  25. #define CRUSH_MAX_RULES CRUSH_MAX_RULESET /* should be the same as max rulesets */
  26. #define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
  27. #define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)
  28. #define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */
  29. #define CRUSH_ITEM_NONE 0x7fffffff /* no result */
  30. /*
  31. * CRUSH uses user-defined "rules" to describe how inputs should be
  32. * mapped to devices. A rule consists of sequence of steps to perform
  33. * to generate the set of output devices.
  34. */
  35. struct crush_rule_step {
  36. __u32 op;
  37. __s32 arg1;
  38. __s32 arg2;
  39. };
  40. /* step op codes */
  41. enum {
  42. CRUSH_RULE_NOOP = 0,
  43. CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
  44. CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
  45. /* arg2 = type */
  46. CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
  47. CRUSH_RULE_EMIT = 4, /* no args */
  48. CRUSH_RULE_CHOOSELEAF_FIRSTN = 6,
  49. CRUSH_RULE_CHOOSELEAF_INDEP = 7,
  50. CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */
  51. CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
  52. CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
  53. CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
  54. CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12,
  55. CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13
  56. };
  57. /*
  58. * for specifying choose num (arg1) relative to the max parameter
  59. * passed to do_rule
  60. */
  61. #define CRUSH_CHOOSE_N 0
  62. #define CRUSH_CHOOSE_N_MINUS(x) (-(x))
  63. /*
  64. * The rule mask is used to describe what the rule is intended for.
  65. * Given a ruleset and size of output set, we search through the
  66. * rule list for a matching rule_mask.
  67. */
  68. struct crush_rule_mask {
  69. __u8 ruleset;
  70. __u8 type;
  71. __u8 min_size;
  72. __u8 max_size;
  73. };
  74. struct crush_rule {
  75. __u32 len;
  76. struct crush_rule_mask mask;
  77. struct crush_rule_step steps[];
  78. };
  79. #define crush_rule_size(len) (sizeof(struct crush_rule) + \
  80. (len)*sizeof(struct crush_rule_step))
  81. /*
  82. * A bucket is a named container of other items (either devices or
  83. * other buckets). Items within a bucket are chosen using one of a
  84. * few different algorithms. The table summarizes how the speed of
  85. * each option measures up against mapping stability when items are
  86. * added or removed.
  87. *
  88. * Bucket Alg Speed Additions Removals
  89. * ------------------------------------------------
  90. * uniform O(1) poor poor
  91. * list O(n) optimal poor
  92. * tree O(log n) good good
  93. * straw O(n) better better
  94. * straw2 O(n) optimal optimal
  95. */
  96. enum {
  97. CRUSH_BUCKET_UNIFORM = 1,
  98. CRUSH_BUCKET_LIST = 2,
  99. CRUSH_BUCKET_TREE = 3,
  100. CRUSH_BUCKET_STRAW = 4,
  101. CRUSH_BUCKET_STRAW2 = 5,
  102. };
  103. extern const char *crush_bucket_alg_name(int alg);
  104. /*
  105. * although tree was a legacy algorithm, it has been buggy, so
  106. * exclude it.
  107. */
  108. #define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS ( \
  109. (1 << CRUSH_BUCKET_UNIFORM) | \
  110. (1 << CRUSH_BUCKET_LIST) | \
  111. (1 << CRUSH_BUCKET_STRAW))
  112. struct crush_bucket {
  113. __s32 id; /* this'll be negative */
  114. __u16 type; /* non-zero; type=0 is reserved for devices */
  115. __u8 alg; /* one of CRUSH_BUCKET_* */
  116. __u8 hash; /* which hash function to use, CRUSH_HASH_* */
  117. __u32 weight; /* 16-bit fixed point */
  118. __u32 size; /* num items */
  119. __s32 *items;
  120. };
  121. /** @ingroup API
  122. *
  123. * Replacement weights for each item in a bucket. The size of the
  124. * array must be exactly the size of the straw2 bucket, just as the
  125. * item_weights array.
  126. *
  127. */
  128. struct crush_weight_set {
  129. __u32 *weights; /*!< 16.16 fixed point weights
  130. in the same order as items */
  131. __u32 size; /*!< size of the __weights__ array */
  132. };
  133. /** @ingroup API
  134. *
  135. * Replacement weights and ids for a given straw2 bucket, for
  136. * placement purposes.
  137. *
  138. * When crush_do_rule() chooses the Nth item from a straw2 bucket, the
  139. * replacement weights found at __weight_set[N]__ are used instead of
  140. * the weights from __item_weights__. If __N__ is greater than
  141. * __weight_set_size__, the weights found at __weight_set_size-1__ are
  142. * used instead. For instance if __weight_set__ is:
  143. *
  144. * [ [ 0x10000, 0x20000 ], // position 0
  145. * [ 0x20000, 0x40000 ] ] // position 1
  146. *
  147. * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ]
  148. * choosing the 1th item will use position 1 weights [ 0x20000, 0x40000 ]
  149. * choosing the 2th item will use position 1 weights [ 0x20000, 0x40000 ]
  150. * etc.
  151. *
  152. */
  153. struct crush_choose_arg {
  154. __s32 *ids; /*!< values to use instead of items */
  155. __u32 ids_size; /*!< size of the __ids__ array */
  156. struct crush_weight_set *weight_set; /*!< weight replacements for
  157. a given position */
  158. __u32 weight_set_size; /*!< size of the __weight_set__ array */
  159. };
  160. /** @ingroup API
  161. *
  162. * Replacement weights and ids for each bucket in the crushmap. The
  163. * __size__ of the __args__ array must be exactly the same as the
  164. * __map->max_buckets__.
  165. *
  166. * The __crush_choose_arg__ at index N will be used when choosing
  167. * an item from the bucket __map->buckets[N]__ bucket, provided it
  168. * is a straw2 bucket.
  169. *
  170. */
  171. struct crush_choose_arg_map {
  172. #ifdef __KERNEL__
  173. struct rb_node node;
  174. s64 choose_args_index;
  175. #endif
  176. struct crush_choose_arg *args; /*!< replacement for each bucket
  177. in the crushmap */
  178. __u32 size; /*!< size of the __args__ array */
  179. };
  180. struct crush_bucket_uniform {
  181. struct crush_bucket h;
  182. __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
  183. };
  184. struct crush_bucket_list {
  185. struct crush_bucket h;
  186. __u32 *item_weights; /* 16-bit fixed point */
  187. __u32 *sum_weights; /* 16-bit fixed point. element i is sum
  188. of weights 0..i, inclusive */
  189. };
  190. struct crush_bucket_tree {
  191. struct crush_bucket h; /* note: h.size is _tree_ size, not number of
  192. actual items */
  193. __u8 num_nodes;
  194. __u32 *node_weights;
  195. };
  196. struct crush_bucket_straw {
  197. struct crush_bucket h;
  198. __u32 *item_weights; /* 16-bit fixed point */
  199. __u32 *straws; /* 16-bit fixed point */
  200. };
  201. struct crush_bucket_straw2 {
  202. struct crush_bucket h;
  203. __u32 *item_weights; /* 16-bit fixed point */
  204. };
  205. /*
  206. * CRUSH map includes all buckets, rules, etc.
  207. */
  208. struct crush_map {
  209. struct crush_bucket **buckets;
  210. struct crush_rule **rules;
  211. __s32 max_buckets;
  212. __u32 max_rules;
  213. __s32 max_devices;
  214. /* choose local retries before re-descent */
  215. __u32 choose_local_tries;
  216. /* choose local attempts using a fallback permutation before
  217. * re-descent */
  218. __u32 choose_local_fallback_tries;
  219. /* choose attempts before giving up */
  220. __u32 choose_total_tries;
  221. /* attempt chooseleaf inner descent once for firstn mode; on
  222. * reject retry outer descent. Note that this does *not*
  223. * apply to a collision: in that case we will retry as we used
  224. * to. */
  225. __u32 chooseleaf_descend_once;
  226. /* if non-zero, feed r into chooseleaf, bit-shifted right by (r-1)
  227. * bits. a value of 1 is best for new clusters. for legacy clusters
  228. * that want to limit reshuffling, a value of 3 or 4 will make the
  229. * mappings line up a bit better with previous mappings. */
  230. __u8 chooseleaf_vary_r;
  231. /* if true, it makes chooseleaf firstn to return stable results (if
  232. * no local retry) so that data migrations would be optimal when some
  233. * device fails. */
  234. __u8 chooseleaf_stable;
  235. /*
  236. * This value is calculated after decode or construction by
  237. * the builder. It is exposed here (rather than having a
  238. * 'build CRUSH working space' function) so that callers can
  239. * reserve a static buffer, allocate space on the stack, or
  240. * otherwise avoid calling into the heap allocator if they
  241. * want to. The size of the working space depends on the map,
  242. * while the size of the scratch vector passed to the mapper
  243. * depends on the size of the desired result set.
  244. *
  245. * Nothing stops the caller from allocating both in one swell
  246. * foop and passing in two points, though.
  247. */
  248. size_t working_size;
  249. #ifndef __KERNEL__
  250. /*
  251. * version 0 (original) of straw_calc has various flaws. version 1
  252. * fixes a few of them.
  253. */
  254. __u8 straw_calc_version;
  255. /*
  256. * allowed bucket algs is a bitmask, here the bit positions
  257. * are CRUSH_BUCKET_*. note that these are *bits* and
  258. * CRUSH_BUCKET_* values are not, so we need to or together (1
  259. * << CRUSH_BUCKET_WHATEVER). The 0th bit is not used to
  260. * minimize confusion (bucket type values start at 1).
  261. */
  262. __u32 allowed_bucket_algs;
  263. __u32 *choose_tries;
  264. #else
  265. /* device/bucket type id -> type name (CrushWrapper::type_map) */
  266. struct rb_root type_names;
  267. /* device/bucket id -> name (CrushWrapper::name_map) */
  268. struct rb_root names;
  269. /* CrushWrapper::choose_args */
  270. struct rb_root choose_args;
  271. #endif
  272. };
  273. /* crush.c */
  274. extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int pos);
  275. extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
  276. extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
  277. extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
  278. extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
  279. extern void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b);
  280. extern void crush_destroy_bucket(struct crush_bucket *b);
  281. extern void crush_destroy_rule(struct crush_rule *r);
  282. extern void crush_destroy(struct crush_map *map);
  283. static inline int crush_calc_tree_node(int i)
  284. {
  285. return ((i+1) << 1)-1;
  286. }
  287. /*
  288. * These data structures are private to the CRUSH implementation. They
  289. * are exposed in this header file because builder needs their
  290. * definitions to calculate the total working size.
  291. *
  292. * Moving this out of the crush map allow us to treat the CRUSH map as
  293. * immutable within the mapper and removes the requirement for a CRUSH
  294. * map lock.
  295. */
  296. struct crush_work_bucket {
  297. __u32 perm_x; /* @x for which *perm is defined */
  298. __u32 perm_n; /* num elements of *perm that are permuted/defined */
  299. __u32 *perm; /* Permutation of the bucket's items */
  300. };
  301. struct crush_work {
  302. struct crush_work_bucket **work; /* Per-bucket working store */
  303. #ifdef __KERNEL__
  304. struct list_head item;
  305. #endif
  306. };
  307. #ifdef __KERNEL__
  308. /* osdmap.c */
  309. void clear_crush_names(struct rb_root *root);
  310. void clear_choose_args(struct crush_map *c);
  311. #endif
  312. #endif