/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RECIPROCAL_DIV_H
#define _LINUX_RECIPROCAL_DIV_H

#include <linux/types.h>

/*
 * This algorithm is based on the paper "Division by Invariant
 * Integers Using Multiplication" by Torbjörn Granlund and Peter
 * L. Montgomery.
 *
 * The assembler implementation from Agner Fog, which this code is
 * based on, can be found here:
 * http://www.agner.org/optimize/asmlib.zip
 *
 * This optimization for A/B is helpful if the divisor B is mostly
 * runtime invariant. The reciprocal of B is calculated in the
 * slow-path with reciprocal_value(). The fast-path can then just use
 * a much faster multiplication operation with a variable dividend A
 * to calculate the division A/B.
 */

struct reciprocal_value {
	u32 m;
	u8 sh1, sh2;
};
- /* "reciprocal_value" and "reciprocal_divide" together implement the basic
- * version of the algorithm described in Figure 4.1 of the paper.
- */
- struct reciprocal_value reciprocal_value(u32 d);
- static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
- {
- u32 t = (u32)(((u64)a * R.m) >> 32);
- return (t + ((a - t) >> R.sh1)) >> R.sh2;
- }
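
/* Usage sketch (illustrative only, not part of this header): when many
 * dividends share one mostly runtime-invariant divisor d, convert d once on
 * the slow path and reuse the result for every fast-path division. The
 * variable names below are made up for the example.
 *
 *   struct reciprocal_value rv = reciprocal_value(d);   // slow path, once
 *   u32 i;
 *
 *   for (i = 0; i < n; i++)
 *       quot[i] = reciprocal_divide(val[i], rv);        // val[i] / d
 */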

struct reciprocal_value_adv {
	u32 m;
	u8 sh, exp;
	bool is_wide_m;
};
- /* "reciprocal_value_adv" implements the advanced version of the algorithm
- * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose
- * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The
- * exception case could be easily handled before calling "reciprocal_value_adv".
- *
- * The advanced version requires more complex calculation to get the reciprocal
- * multiplier and other control variables, but then could reduce the required
- * emulation operations.
- *
- * It makes no sense to use this advanced version for host divide emulation,
- * those extra complexities for calculating multiplier etc could completely
- * waive our saving on emulation operations.
- *
- * However, it makes sense to use it for JIT divide code generation for which
- * we are willing to trade performance of JITed code with that of host. As shown
- * by the following pseudo code, the required emulation operations could go down
- * from 6 (the basic version) to 3 or 4.
- *
- * To use the result of "reciprocal_value_adv", suppose we want to calculate
- * n/d, the pseudo C code will be:
- *
- * struct reciprocal_value_adv rvalue;
- * u8 pre_shift, exp;
- *
- * // handle exception case.
- * if (d >= (1U << 31)) {
- * result = n >= d;
- * return;
- * }
- *
- * rvalue = reciprocal_value_adv(d, 32)
- * exp = rvalue.exp;
- * if (rvalue.is_wide_m && !(d & 1)) {
- * // floor(log2(d & (2^32 -d)))
- * pre_shift = fls(d & -d) - 1;
- * rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
- * } else {
- * pre_shift = 0;
- * }
- *
- * // code generation starts.
- * if (imm == 1U << exp) {
- * result = n >> exp;
- * } else if (rvalue.is_wide_m) {
- * // pre_shift must be zero when reached here.
- * t = (n * rvalue.m) >> 32;
- * result = n - t;
- * result >>= 1;
- * result += t;
- * result >>= rvalue.sh - 1;
- * } else {
- * if (pre_shift)
- * result = n >> pre_shift;
- * result = ((u64)result * rvalue.m) >> 32;
- * result >>= rvalue.sh;
- * }
- */
- struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
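
/* Illustrative sketch (not part of this header): the pseudo code above,
 * wrapped into a plain C helper, e.g. to cross-check a JIT lowering against a
 * regular divide on the host. The helper name is made up for the example.
 *
 *   static u32 div_by_reciprocal_adv(u32 n, u32 d)
 *   {
 *       struct reciprocal_value_adv rvalue;
 *       u8 pre_shift, exp;
 *       u32 t, result;
 *
 *       if (d >= (1U << 31))                 // exception case, see above
 *           return n >= d;
 *
 *       rvalue = reciprocal_value_adv(d, 32);
 *       exp = rvalue.exp;
 *       if (rvalue.is_wide_m && !(d & 1)) {
 *           pre_shift = fls(d & -d) - 1;
 *           rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
 *       } else {
 *           pre_shift = 0;
 *       }
 *
 *       if (d == 1U << exp) {
 *           result = n >> exp;
 *       } else if (rvalue.is_wide_m) {
 *           t = ((u64)n * rvalue.m) >> 32;
 *           result = (t + ((n - t) >> 1)) >> (rvalue.sh - 1);
 *       } else {
 *           result = pre_shift ? n >> pre_shift : n;
 *           result = ((u64)result * rvalue.m) >> 32;
 *           result >>= rvalue.sh;
 *       }
 *       return result;                       // equals n / d
 *   }
 */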

#endif /* _LINUX_RECIPROCAL_DIV_H */