mirror of https://github.com/Qortal/Brooklyn
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
104 lines
3.3 KiB
104 lines
3.3 KiB
/* SPDX-License-Identifier: GPL-2.0 */ |
|
#ifndef _LINUX_RECIPROCAL_DIV_H |
|
#define _LINUX_RECIPROCAL_DIV_H |
|
|
|
#include <linux/types.h> |
|
|
|
/*
 * This algorithm is based on the paper "Division by Invariant
 * Integers Using Multiplication" by Torbjörn Granlund and Peter
 * L. Montgomery.
 *
 * The assembler implementation from Agner Fog, which this code is
 * based on, can be found here:
 * http://www.agner.org/optimize/asmlib.zip
 *
 * This optimization for A/B is helpful if the divisor B is mostly
 * runtime invariant. The reciprocal of B is calculated in the
 * slow-path with reciprocal_value(). The fast-path can then just use
 * a much faster multiplication operation with a variable dividend A
 * to calculate the division A/B.
 */
|
|
|
struct reciprocal_value { |
|
u32 m; |
|
u8 sh1, sh2; |
|
}; |
|
|
|
/* "reciprocal_value" and "reciprocal_divide" together implement the basic |
|
* version of the algorithm described in Figure 4.1 of the paper. |
|
*/ |
|
struct reciprocal_value reciprocal_value(u32 d); |
|
|
|
/*
 * Fast-path division: returns a / d, where R is the reciprocal previously
 * computed for the divisor d by reciprocal_value(d).
 *
 * t is the high 32 bits of the 64-bit product a * R.m. Because R.m is a
 * 32-bit value, t never exceeds a, so (a - t) cannot underflow, and
 * t + ((a - t) >> R.sh1) never exceeds a, so the sum cannot overflow 32 bits.
 */
static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
{
	u32 t = (u32)(((u64)a * R.m) >> 32);

	return (t + ((a - t) >> R.sh1)) >> R.sh2;
}
|
|
|
struct reciprocal_value_adv { |
|
u32 m; |
|
u8 sh, exp; |
|
bool is_wide_m; |
|
}; |
|
|
|
/* "reciprocal_value_adv" implements the advanced version of the algorithm |
|
* described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose |
|
* ceil(log2(d)) result will be 32 which then requires u128 divide on host. The |
|
* exception case could be easily handled before calling "reciprocal_value_adv". |
|
* |
|
* The advanced version requires more complex calculation to get the reciprocal |
|
* multiplier and other control variables, but then could reduce the required |
|
* emulation operations. |
|
* |
|
* It makes no sense to use this advanced version for host divide emulation, |
|
* those extra complexities for calculating multiplier etc could completely |
|
* waive our saving on emulation operations. |
|
* |
|
* However, it makes sense to use it for JIT divide code generation for which |
|
* we are willing to trade performance of JITed code with that of host. As shown |
|
* by the following pseudo code, the required emulation operations could go down |
|
* from 6 (the basic version) to 3 or 4. |
|
* |
|
* To use the result of "reciprocal_value_adv", suppose we want to calculate |
|
* n/d, the pseudo C code will be: |
|
* |
|
* struct reciprocal_value_adv rvalue; |
|
* u8 pre_shift, exp; |
|
* |
|
* // handle exception case. |
|
* if (d >= (1U << 31)) { |
|
* result = n >= d; |
|
* return; |
|
* } |
|
* |
|
* rvalue = reciprocal_value_adv(d, 32) |
|
* exp = rvalue.exp; |
|
* if (rvalue.is_wide_m && !(d & 1)) { |
|
* // floor(log2(d & (2^32 -d))) |
|
* pre_shift = fls(d & -d) - 1; |
|
* rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift); |
|
* } else { |
|
* pre_shift = 0; |
|
* } |
|
* |
|
* // code generation starts. |
|
* if (imm == 1U << exp) { |
|
* result = n >> exp; |
|
* } else if (rvalue.is_wide_m) { |
|
* // pre_shift must be zero when reached here. |
|
* t = (n * rvalue.m) >> 32; |
|
* result = n - t; |
|
* result >>= 1; |
|
* result += t; |
|
* result >>= rvalue.sh - 1; |
|
* } else { |
|
* if (pre_shift) |
|
* result = n >> pre_shift; |
|
* result = ((u64)result * rvalue.m) >> 32; |
|
* result >>= rvalue.sh; |
|
* } |
|
*/ |
|
struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec); |
|
|
|
#endif /* _LINUX_RECIPROCAL_DIV_H */
|
|
|