reciprocal_div.h 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. /* SPDX-License-Identifier: GPL-2.0 */
  2. #ifndef _LINUX_RECIPROCAL_DIV_H
  3. #define _LINUX_RECIPROCAL_DIV_H
  4. #include <linux/types.h>
  5. /*
  6. * This algorithm is based on the paper "Division by Invariant
  7. * Integers Using Multiplication" by Torbjörn Granlund and Peter
  8. * L. Montgomery.
  9. *
  10. * The assembler implementation from Agner Fog, which this code is
  11. * based on, can be found here:
  12. * http://www.agner.org/optimize/asmlib.zip
  13. *
  14. * This optimization for A/B is helpful if the divisor B is mostly
  15. * runtime invariant. The reciprocal of B is calculated in the
  16. * slow-path with reciprocal_value(). The fast-path can then just use
  17. * a much faster multiplication operation with a variable dividend A
  18. * to calculate the division A/B.
  19. */
  20. struct reciprocal_value {
  21. u32 m;
  22. u8 sh1, sh2;
  23. };
  24. /* "reciprocal_value" and "reciprocal_divide" together implement the basic
  25. * version of the algorithm described in Figure 4.1 of the paper.
  26. */
  27. struct reciprocal_value reciprocal_value(u32 d);
  28. static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
  29. {
  30. u32 t = (u32)(((u64)a * R.m) >> 32);
  31. return (t + ((a - t) >> R.sh1)) >> R.sh2;
  32. }
  33. struct reciprocal_value_adv {
  34. u32 m;
  35. u8 sh, exp;
  36. bool is_wide_m;
  37. };
  38. /* "reciprocal_value_adv" implements the advanced version of the algorithm
  39. * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose
  40. * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The
  41. * exception case could be easily handled before calling "reciprocal_value_adv".
  42. *
  43. * The advanced version requires more complex calculation to get the reciprocal
  44. * multiplier and other control variables, but then could reduce the required
  45. * emulation operations.
  46. *
  47. * It makes no sense to use this advanced version for host divide emulation,
  48. * those extra complexities for calculating multiplier etc could completely
  49. * waive our saving on emulation operations.
  50. *
  51. * However, it makes sense to use it for JIT divide code generation for which
  52. * we are willing to trade performance of JITed code with that of host. As shown
  53. * by the following pseudo code, the required emulation operations could go down
  54. * from 6 (the basic version) to 3 or 4.
  55. *
  56. * To use the result of "reciprocal_value_adv", suppose we want to calculate
  57. * n/d, the pseudo C code will be:
  58. *
  59. * struct reciprocal_value_adv rvalue;
  60. * u8 pre_shift, exp;
  61. *
  62. * // handle exception case.
  63. * if (d >= (1U << 31)) {
  64. * result = n >= d;
  65. * return;
  66. * }
  67. *
  68. * rvalue = reciprocal_value_adv(d, 32)
  69. * exp = rvalue.exp;
  70. * if (rvalue.is_wide_m && !(d & 1)) {
  71. * // floor(log2(d & (2^32 -d)))
  72. * pre_shift = fls(d & -d) - 1;
  73. * rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
  74. * } else {
  75. * pre_shift = 0;
  76. * }
  77. *
  78. * // code generation starts.
  79. * if (imm == 1U << exp) {
  80. * result = n >> exp;
  81. * } else if (rvalue.is_wide_m) {
  82. * // pre_shift must be zero when reached here.
  83. * t = (n * rvalue.m) >> 32;
  84. * result = n - t;
  85. * result >>= 1;
  86. * result += t;
  87. * result >>= rvalue.sh - 1;
  88. * } else {
  89. * if (pre_shift)
  90. * result = n >> pre_shift;
  91. * result = ((u64)result * rvalue.m) >> 32;
  92. * result >>= rvalue.sh;
  93. * }
  94. */
  95. struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
  96. #endif /* _LINUX_RECIPROCAL_DIV_H */