/* lpm_trie.c — scraped page chrome (file-size banner and line-number gutter) removed */
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Longest prefix match list implementation
 *
 * Copyright (c) 2016,2017 Daniel Mack
 * Copyright (c) 2016 David Herrmann
 */

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <net/ipv6.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>

/* Intermediate node */
#define LPM_TREE_NODE_FLAG_IM BIT(0)
struct lpm_trie_node;

/* One trie node: a stored prefix plus up to two more-specific children.
 * The per-element value, when present, is stored in data[] directly behind
 * the trie's data_size prefix bytes (see lpm_trie_node_alloc()).
 */
struct lpm_trie_node {
	struct rcu_head rcu;			/* deferred free via kfree_rcu() */
	struct lpm_trie_node __rcu *child[2];	/* indexed by the next key bit */
	u32 prefixlen;				/* number of valid bits in data[] */
	u32 flags;				/* LPM_TREE_NODE_FLAG_IM if intermediate */
	u8 data[];				/* prefix bytes (+ optional value) */
};
/* Map private data for one LPM trie instance. */
struct lpm_trie {
	struct bpf_map map;		/* embedded map header; container_of() recovers the trie */
	struct lpm_trie_node __rcu *root;
	size_t n_entries;		/* count of non-intermediate nodes */
	size_t max_prefixlen;		/* data_size * 8, set in trie_alloc() */
	size_t data_size;		/* key data length in bytes */
	spinlock_t lock;		/* serializes update/delete paths */
};
  35. /* This trie implements a longest prefix match algorithm that can be used to
  36. * match IP addresses to a stored set of ranges.
  37. *
  38. * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is
  39. * interpreted as big endian, so data[0] stores the most significant byte.
  40. *
  41. * Match ranges are internally stored in instances of struct lpm_trie_node
  42. * which each contain their prefix length as well as two pointers that may
  43. * lead to more nodes containing more specific matches. Each node also stores
  44. * a value that is defined by and returned to userspace via the update_elem
  45. * and lookup functions.
  46. *
  47. * For instance, let's start with a trie that was created with a prefix length
  48. * of 32, so it can be used for IPv4 addresses, and one single element that
  49. * matches 192.168.0.0/16. The data array would hence contain
  50. * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will
  51. * stick to IP-address notation for readability though.
  52. *
  53. * As the trie is empty initially, the new node (1) will be places as root
  54. * node, denoted as (R) in the example below. As there are no other node, both
  55. * child pointers are %NULL.
  56. *
  57. * +----------------+
  58. * | (1) (R) |
  59. * | 192.168.0.0/16 |
  60. * | value: 1 |
  61. * | [0] [1] |
  62. * +----------------+
  63. *
  64. * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already
  65. * a node with the same data and a smaller prefix (ie, a less specific one),
  66. * node (2) will become a child of (1). In child index depends on the next bit
  67. * that is outside of what (1) matches, and that bit is 0, so (2) will be
  68. * child[0] of (1):
  69. *
  70. * +----------------+
  71. * | (1) (R) |
  72. * | 192.168.0.0/16 |
  73. * | value: 1 |
  74. * | [0] [1] |
  75. * +----------------+
  76. * |
  77. * +----------------+
  78. * | (2) |
  79. * | 192.168.0.0/24 |
  80. * | value: 2 |
  81. * | [0] [1] |
  82. * +----------------+
  83. *
  84. * The child[1] slot of (1) could be filled with another node which has bit #17
  85. * (the next bit after the ones that (1) matches on) set to 1. For instance,
  86. * 192.168.128.0/24:
  87. *
  88. * +----------------+
  89. * | (1) (R) |
  90. * | 192.168.0.0/16 |
  91. * | value: 1 |
  92. * | [0] [1] |
  93. * +----------------+
  94. * | |
  95. * +----------------+ +------------------+
  96. * | (2) | | (3) |
  97. * | 192.168.0.0/24 | | 192.168.128.0/24 |
  98. * | value: 2 | | value: 3 |
  99. * | [0] [1] | | [0] [1] |
  100. * +----------------+ +------------------+
  101. *
  102. * Let's add another node (4) to the game for 192.168.1.0/24. In order to place
  103. * it, node (1) is looked at first, and because (4) of the semantics laid out
  104. * above (bit #17 is 0), it would normally be attached to (1) as child[0].
  105. * However, that slot is already allocated, so a new node is needed in between.
  106. * That node does not have a value attached to it and it will never be
  107. * returned to users as result of a lookup. It is only there to differentiate
  108. * the traversal further. It will get a prefix as wide as necessary to
  109. * distinguish its two children:
  110. *
  111. * +----------------+
  112. * | (1) (R) |
  113. * | 192.168.0.0/16 |
  114. * | value: 1 |
  115. * | [0] [1] |
  116. * +----------------+
  117. * | |
  118. * +----------------+ +------------------+
  119. * | (4) (I) | | (3) |
  120. * | 192.168.0.0/23 | | 192.168.128.0/24 |
  121. * | value: --- | | value: 3 |
  122. * | [0] [1] | | [0] [1] |
  123. * +----------------+ +------------------+
  124. * | |
  125. * +----------------+ +----------------+
  126. * | (2) | | (5) |
  127. * | 192.168.0.0/24 | | 192.168.1.0/24 |
  128. * | value: 2 | | value: 5 |
  129. * | [0] [1] | | [0] [1] |
  130. * +----------------+ +----------------+
  131. *
  132. * 192.168.1.1/32 would be a child of (5) etc.
  133. *
  134. * An intermediate node will be turned into a 'real' node on demand. In the
  135. * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie.
  136. *
  137. * A fully populated trie would have a height of 32 nodes, as the trie was
  138. * created with a prefix length of 32.
  139. *
  140. * The lookup starts at the root node. If the current node matches and if there
  141. * is a child that can be used to become more specific, the trie is traversed
  142. * downwards. The last node in the traversal that is a non-intermediate one is
  143. * returned.
  144. */
  145. static inline int extract_bit(const u8 *data, size_t index)
  146. {
  147. return !!(data[index / 8] & (1 << (7 - (index % 8))));
  148. }
/**
 * longest_prefix_match() - determine the longest prefix
 * @trie: The trie to get internal sizes from
 * @node: The node to operate on
 * @key: The key to compare to @node
 *
 * Determine the longest prefix of @node that matches the bits in @key.
 */
static size_t longest_prefix_match(const struct lpm_trie *trie,
				   const struct lpm_trie_node *node,
				   const struct bpf_lpm_trie_key *key)
{
	/* The match can never be longer than the shorter of the two prefixes */
	u32 limit = min(node->prefixlen, key->prefixlen);
	u32 prefixlen = 0, i = 0;

	/* The word-sized loads below rely on 4-byte alignment of data[] */
	BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32));
	BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32));

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)

	/* data_size >= 16 has very small probability.
	 * We do not use a loop for optimal code generation.
	 */
	if (trie->data_size >= 8) {
		/* XOR leaves the differing bits set; fls64() locates the
		 * highest one, so 64 - fls64(diff) counts the leading equal
		 * bits of this 8-byte chunk.
		 */
		u64 diff = be64_to_cpu(*(__be64 *)node->data ^
				       *(__be64 *)key->data);

		prefixlen = 64 - fls64(diff);
		if (prefixlen >= limit)
			return limit;
		if (diff)
			return prefixlen;
		i = 8;
	}
#endif

	/* Compare the remaining data 32 bits at a time */
	while (trie->data_size >= i + 4) {
		u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^
				       *(__be32 *)&key->data[i]);

		prefixlen += 32 - fls(diff);
		if (prefixlen >= limit)
			return limit;
		if (diff)
			return prefixlen;
		i += 4;
	}

	/* Tail: at most one 16-bit chunk ... */
	if (trie->data_size >= i + 2) {
		u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^
				       *(__be16 *)&key->data[i]);

		prefixlen += 16 - fls(diff);
		if (prefixlen >= limit)
			return limit;
		if (diff)
			return prefixlen;
		i += 2;
	}

	/* ... and at most one final byte */
	if (trie->data_size >= i + 1) {
		prefixlen += 8 - fls(node->data[i] ^ key->data[i]);

		if (prefixlen >= limit)
			return limit;
	}

	return prefixlen;
}
/* Called from syscall or from eBPF program.
 * Returns a pointer to the value of the longest stored prefix matching
 * @_key, or NULL if no stored prefix matches at all.
 */
static void *trie_lookup_elem(struct bpf_map *map, void *_key)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct lpm_trie_node *node, *found = NULL;
	struct bpf_lpm_trie_key *key = _key;

	/* Start walking the trie from the root node ... */

	for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held());
	     node;) {
		unsigned int next_bit;
		size_t matchlen;

		/* Determine the longest prefix of @node that matches @key.
		 * If it's the maximum possible prefix for this trie, we have
		 * an exact match and can return it directly.
		 */
		matchlen = longest_prefix_match(trie, node, key);
		if (matchlen == trie->max_prefixlen) {
			found = node;
			break;
		}

		/* If the number of bits that match is smaller than the prefix
		 * length of @node, bail out and return the node we have seen
		 * last in the traversal (ie, the parent).
		 */
		if (matchlen < node->prefixlen)
			break;

		/* Consider this node as return candidate unless it is an
		 * artificially added intermediate one.
		 */
		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
			found = node;

		/* If the node match is fully satisfied, let's see if we can
		 * become more specific. Determine the next bit in the key and
		 * traverse down.
		 */
		next_bit = extract_bit(key->data, node->prefixlen);
		node = rcu_dereference_check(node->child[next_bit],
					     rcu_read_lock_bh_held());
	}

	if (!found)
		return NULL;

	/* The value lives in data[] directly behind the prefix bytes */
	return found->data + trie->data_size;
}
  250. static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
  251. const void *value)
  252. {
  253. struct lpm_trie_node *node;
  254. size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
  255. if (value)
  256. size += trie->map.value_size;
  257. node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
  258. trie->map.numa_node);
  259. if (!node)
  260. return NULL;
  261. node->flags = 0;
  262. if (value)
  263. memcpy(node->data + trie->data_size, value,
  264. trie->map.value_size);
  265. return node;
  266. }
/* Called from syscall or from eBPF program.
 * Insert or replace the element for @_key. Returns 0 on success, -EINVAL
 * for bad flags/prefixlen, -ENOSPC when the map is full, -ENOMEM on
 * allocation failure.
 */
static int trie_update_elem(struct bpf_map *map,
			    void *_key, void *value, u64 flags)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
	struct lpm_trie_node __rcu **slot;
	struct bpf_lpm_trie_key *key = _key;
	unsigned long irq_flags;
	unsigned int next_bit;
	size_t matchlen = 0;
	int ret = 0;

	if (unlikely(flags > BPF_EXIST))
		return -EINVAL;

	if (key->prefixlen > trie->max_prefixlen)
		return -EINVAL;

	spin_lock_irqsave(&trie->lock, irq_flags);

	/* Allocate and fill a new node */

	if (trie->n_entries == trie->map.max_entries) {
		ret = -ENOSPC;
		goto out;
	}

	new_node = lpm_trie_node_alloc(trie, value);
	if (!new_node) {
		ret = -ENOMEM;
		goto out;
	}

	/* Account the entry up front; undone in the error path below */
	trie->n_entries++;

	new_node->prefixlen = key->prefixlen;
	RCU_INIT_POINTER(new_node->child[0], NULL);
	RCU_INIT_POINTER(new_node->child[1], NULL);
	memcpy(new_node->data, key->data, trie->data_size);

	/* Now find a slot to attach the new node. To do that, walk the tree
	 * from the root and match as many bits as possible for each node until
	 * we either find an empty slot or a slot that needs to be replaced by
	 * an intermediate node.
	 */
	slot = &trie->root;

	while ((node = rcu_dereference_protected(*slot,
					lockdep_is_held(&trie->lock)))) {
		matchlen = longest_prefix_match(trie, node, key);

		if (node->prefixlen != matchlen ||
		    node->prefixlen == key->prefixlen ||
		    node->prefixlen == trie->max_prefixlen)
			break;

		next_bit = extract_bit(key->data, node->prefixlen);
		slot = &node->child[next_bit];
	}

	/* If the slot is empty (a free child pointer or an empty root),
	 * simply assign the @new_node to that slot and be done.
	 */
	if (!node) {
		rcu_assign_pointer(*slot, new_node);
		goto out;
	}

	/* If the slot we picked already exists, replace it with @new_node
	 * which already has the correct data array set.
	 */
	if (node->prefixlen == matchlen) {
		new_node->child[0] = node->child[0];
		new_node->child[1] = node->child[1];

		/* Only decrement when a real node is replaced; replacing an
		 * intermediate node turns it into a counted entry.
		 */
		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
			trie->n_entries--;

		rcu_assign_pointer(*slot, new_node);
		kfree_rcu(node, rcu);

		goto out;
	}

	/* If the new node matches the prefix completely, it must be inserted
	 * as an ancestor. Simply insert it between @node and *@slot.
	 */
	if (matchlen == key->prefixlen) {
		next_bit = extract_bit(node->data, matchlen);
		rcu_assign_pointer(new_node->child[next_bit], node);
		rcu_assign_pointer(*slot, new_node);
		goto out;
	}

	/* Otherwise the prefixes diverge: splice in a value-less intermediate
	 * node that fans out to @node and @new_node.
	 */
	im_node = lpm_trie_node_alloc(trie, NULL);
	if (!im_node) {
		ret = -ENOMEM;
		goto out;
	}

	im_node->prefixlen = matchlen;
	im_node->flags |= LPM_TREE_NODE_FLAG_IM;
	memcpy(im_node->data, node->data, trie->data_size);

	/* Now determine which child to install in which slot */
	if (extract_bit(key->data, matchlen)) {
		rcu_assign_pointer(im_node->child[0], node);
		rcu_assign_pointer(im_node->child[1], new_node);
	} else {
		rcu_assign_pointer(im_node->child[0], new_node);
		rcu_assign_pointer(im_node->child[1], node);
	}

	/* Finally, assign the intermediate node to the determined slot */
	rcu_assign_pointer(*slot, im_node);

out:
	if (ret) {
		/* Nothing was published to readers yet: undo the accounting
		 * and free with plain kfree() (kfree(NULL) is a no-op).
		 */
		if (new_node)
			trie->n_entries--;

		kfree(new_node);
		kfree(im_node);
	}

	spin_unlock_irqrestore(&trie->lock, irq_flags);

	return ret;
}
/* Called from syscall or from eBPF program.
 * Delete the element whose prefix exactly matches @_key. Returns 0 on
 * success, -EINVAL for an over-long prefix, -ENOENT when no exact,
 * non-intermediate match exists.
 */
static int trie_delete_elem(struct bpf_map *map, void *_key)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct bpf_lpm_trie_key *key = _key;
	struct lpm_trie_node __rcu **trim, **trim2;
	struct lpm_trie_node *node, *parent;
	unsigned long irq_flags;
	unsigned int next_bit;
	size_t matchlen = 0;
	int ret = 0;

	if (key->prefixlen > trie->max_prefixlen)
		return -EINVAL;

	spin_lock_irqsave(&trie->lock, irq_flags);

	/* Walk the tree looking for an exact key/length match and keeping
	 * track of the path we traverse.  We will need to know the node
	 * we wish to delete, and the slot that points to the node we want
	 * to delete.  We may also need to know the nodes parent and the
	 * slot that contains it.
	 */
	trim = &trie->root;
	trim2 = trim;
	parent = NULL;
	while ((node = rcu_dereference_protected(
		       *trim, lockdep_is_held(&trie->lock)))) {
		matchlen = longest_prefix_match(trie, node, key);

		if (node->prefixlen != matchlen ||
		    node->prefixlen == key->prefixlen)
			break;

		parent = node;
		trim2 = trim;
		next_bit = extract_bit(key->data, node->prefixlen);
		trim = &node->child[next_bit];
	}

	/* Only an exact, non-intermediate match may be deleted */
	if (!node || node->prefixlen != key->prefixlen ||
	    node->prefixlen != matchlen ||
	    (node->flags & LPM_TREE_NODE_FLAG_IM)) {
		ret = -ENOENT;
		goto out;
	}

	trie->n_entries--;

	/* If the node we are removing has two children, simply mark it
	 * as intermediate and we are done.
	 */
	if (rcu_access_pointer(node->child[0]) &&
	    rcu_access_pointer(node->child[1])) {
		node->flags |= LPM_TREE_NODE_FLAG_IM;
		goto out;
	}

	/* If the parent of the node we are about to delete is an intermediate
	 * node, and the deleted node doesn't have any children, we can delete
	 * the intermediate parent as well and promote its other child
	 * up the tree.  Doing this maintains the invariant that all
	 * intermediate nodes have exactly 2 children and that there are no
	 * unnecessary intermediate nodes in the tree.
	 */
	if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) &&
	    !node->child[0] && !node->child[1]) {
		if (node == rcu_access_pointer(parent->child[0]))
			rcu_assign_pointer(
				*trim2, rcu_access_pointer(parent->child[1]));
		else
			rcu_assign_pointer(
				*trim2, rcu_access_pointer(parent->child[0]));
		kfree_rcu(parent, rcu);
		kfree_rcu(node, rcu);
		goto out;
	}

	/* The node we are removing has either zero or one child. If there
	 * is a child, move it into the removed node's slot then delete
	 * the node.  Otherwise just clear the slot and delete the node.
	 */
	if (node->child[0])
		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0]));
	else if (node->child[1])
		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
	else
		RCU_INIT_POINTER(*trim, NULL);
	kfree_rcu(node, rcu);

out:
	spin_unlock_irqrestore(&trie->lock, irq_flags);

	return ret;
}
#define LPM_DATA_SIZE_MAX	256
#define LPM_DATA_SIZE_MIN	1

#define LPM_VAL_SIZE_MAX	(KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \
				 sizeof(struct lpm_trie_node))
#define LPM_VAL_SIZE_MIN	1

#define LPM_KEY_SIZE(X)		(sizeof(struct bpf_lpm_trie_key) + (X))
#define LPM_KEY_SIZE_MAX	LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
#define LPM_KEY_SIZE_MIN	LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)

#define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE |	\
				 BPF_F_ACCESS_MASK)
/* Validate the map attributes and allocate an empty trie.
 * Returns the embedded bpf_map on success, an ERR_PTR() on failure.
 */
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
	struct lpm_trie *trie;

	if (!bpf_capable())
		return ERR_PTR(-EPERM);

	/* check sanity of attributes */
	if (attr->max_entries == 0 ||
	    !(attr->map_flags & BPF_F_NO_PREALLOC) ||
	    attr->map_flags & ~LPM_CREATE_FLAG_MASK ||
	    !bpf_map_flags_access_ok(attr->map_flags) ||
	    attr->key_size < LPM_KEY_SIZE_MIN ||
	    attr->key_size > LPM_KEY_SIZE_MAX ||
	    attr->value_size < LPM_VAL_SIZE_MIN ||
	    attr->value_size > LPM_VAL_SIZE_MAX)
		return ERR_PTR(-EINVAL);

	trie = bpf_map_area_alloc(sizeof(*trie), NUMA_NO_NODE);
	if (!trie)
		return ERR_PTR(-ENOMEM);

	/* copy mandatory map attributes */
	bpf_map_init_from_attr(&trie->map, attr);
	/* key_size includes the bpf_lpm_trie_key header; data follows it */
	trie->data_size = attr->key_size -
			  offsetof(struct bpf_lpm_trie_key, data);
	trie->max_prefixlen = trie->data_size * 8;

	spin_lock_init(&trie->lock);

	return &trie->map;
}
/* Release every node and the trie itself.
 * NOTE(review): frees with plain kfree() rather than kfree_rcu() —
 * presumably no readers can still hold references by the time map_free
 * runs; confirm against the bpf map teardown contract.
 */
static void trie_free(struct bpf_map *map)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct lpm_trie_node __rcu **slot;
	struct lpm_trie_node *node;

	/* Always start at the root and walk down to a node that has no
	 * children. Then free that node, nullify its reference in the parent
	 * and start over.
	 */
	for (;;) {
		slot = &trie->root;

		for (;;) {
			node = rcu_dereference_protected(*slot, 1);
			if (!node)
				goto out;

			if (rcu_access_pointer(node->child[0])) {
				slot = &node->child[0];
				continue;
			}

			if (rcu_access_pointer(node->child[1])) {
				slot = &node->child[1];
				continue;
			}

			kfree(node);
			RCU_INIT_POINTER(*slot, NULL);
			break;
		}
	}

out:
	bpf_map_area_free(trie);
}
/* Write into *_next_key the postorder successor of *_key; for a missing or
 * invalid key, return the first key in postorder (the leftmost leaf).
 * Returns 0 on success, -ENOENT on an empty trie or exhausted iteration,
 * -ENOMEM if the ancestor stack cannot be allocated.
 */
static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
{
	struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root;
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
	struct lpm_trie_node **node_stack = NULL;
	int err = 0, stack_ptr = -1;
	unsigned int next_bit;
	size_t matchlen;

	/* The get_next_key follows postorder. For the 4 node example in
	 * the top of this file, the trie_get_next_key() returns the following
	 * one after another:
	 *   192.168.0.0/24
	 *   192.168.1.0/24
	 *   192.168.128.0/24
	 *   192.168.0.0/16
	 *
	 * The idea is to return more specific keys before less specific ones.
	 */

	/* Empty trie */
	search_root = rcu_dereference(trie->root);
	if (!search_root)
		return -ENOENT;

	/* For invalid key, find the leftmost node in the trie */
	if (!key || key->prefixlen > trie->max_prefixlen)
		goto find_leftmost;

	/* Ancestor stack; the trie is at most max_prefixlen levels deep */
	node_stack = kmalloc_array(trie->max_prefixlen,
				   sizeof(struct lpm_trie_node *),
				   GFP_ATOMIC | __GFP_NOWARN);
	if (!node_stack)
		return -ENOMEM;

	/* Try to find the exact node for the given key */
	for (node = search_root; node;) {
		node_stack[++stack_ptr] = node;
		matchlen = longest_prefix_match(trie, node, key);
		if (node->prefixlen != matchlen ||
		    node->prefixlen == key->prefixlen)
			break;

		next_bit = extract_bit(key->data, node->prefixlen);
		node = rcu_dereference(node->child[next_bit]);
	}
	if (!node || node->prefixlen != key->prefixlen ||
	    (node->flags & LPM_TREE_NODE_FLAG_IM))
		goto find_leftmost;

	/* The node with the exactly-matching key has been found,
	 * find the first node in postorder after the matched node.
	 */
	node = node_stack[stack_ptr];
	while (stack_ptr > 0) {
		parent = node_stack[stack_ptr - 1];
		/* Coming up from a left child: the right subtree, if any,
		 * is next in postorder.
		 */
		if (rcu_dereference(parent->child[0]) == node) {
			search_root = rcu_dereference(parent->child[1]);
			if (search_root)
				goto find_leftmost;
		}
		if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) {
			next_node = parent;
			goto do_copy;
		}

		node = parent;
		stack_ptr--;
	}

	/* did not find anything */
	err = -ENOENT;
	goto free_stack;

find_leftmost:
	/* Find the leftmost non-intermediate node, all intermediate nodes
	 * have exact two children, so this function will never return NULL.
	 */
	for (node = search_root; node;) {
		if (node->flags & LPM_TREE_NODE_FLAG_IM) {
			node = rcu_dereference(node->child[0]);
		} else {
			next_node = node;
			node = rcu_dereference(node->child[0]);
			if (!node)
				node = rcu_dereference(next_node->child[1]);
		}
	}
do_copy:
	next_key->prefixlen = next_node->prefixlen;
	memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
	       next_node->data, trie->data_size);
free_stack:
	kfree(node_stack);
	return err;
}
  608. static int trie_check_btf(const struct bpf_map *map,
  609. const struct btf *btf,
  610. const struct btf_type *key_type,
  611. const struct btf_type *value_type)
  612. {
  613. /* Keys must have struct bpf_lpm_trie_key embedded. */
  614. return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ?
  615. -EINVAL : 0;
  616. }
/* BTF type id for struct lpm_trie, referenced by map_btf_id below */
BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie)

/* Operations table registering this file as BPF_MAP_TYPE_LPM_TRIE */
const struct bpf_map_ops trie_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = trie_alloc,
	.map_free = trie_free,
	.map_get_next_key = trie_get_next_key,
	.map_lookup_elem = trie_lookup_elem,
	.map_update_elem = trie_update_elem,
	.map_delete_elem = trie_delete_elem,
	.map_lookup_batch = generic_map_lookup_batch,
	.map_update_batch = generic_map_update_batch,
	.map_delete_batch = generic_map_delete_batch,
	.map_check_btf = trie_check_btf,
	.map_btf_id = &trie_map_btf_ids[0],
};