diff options
Diffstat (limited to 'lib/math')
| -rw-r--r-- | lib/math/div64.c | 185 | ||||
| -rw-r--r-- | lib/math/test_mul_u64_u64_div_u64.c | 191 |
2 files changed, 265 insertions, 111 deletions
diff --git a/lib/math/div64.c b/lib/math/div64.c index bf77b9843175..d1e92ea24fce 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -177,94 +177,157 @@ EXPORT_SYMBOL(div64_s64); * Iterative div/mod for use when dividend is not expected to be much * bigger than divisor. */ +#ifndef iter_div_u64_rem u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder) { return __iter_div_u64_rem(dividend, divisor, remainder); } EXPORT_SYMBOL(iter_div_u64_rem); +#endif -#ifndef mul_u64_u64_div_u64 -u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c) -{ - if (ilog2(a) + ilog2(b) <= 62) - return div64_u64(a * b, c); +#if !defined(mul_u64_add_u64_div_u64) || defined(test_mul_u64_add_u64_div_u64) -#if defined(__SIZEOF_INT128__) +#define mul_add(a, b, c) add_u64_u32(mul_u32_u32(a, b), c) +#if defined(__SIZEOF_INT128__) && !defined(test_mul_u64_add_u64_div_u64) +static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c) +{ /* native 64x64=128 bits multiplication */ - u128 prod = (u128)a * b; - u64 n_lo = prod, n_hi = prod >> 64; + u128 prod = (u128)a * b + c; + *p_lo = prod; + return prod >> 64; +} #else - - /* perform a 64x64=128 bits multiplication manually */ - u32 a_lo = a, a_hi = a >> 32, b_lo = b, b_hi = b >> 32; +static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c) +{ + /* perform a 64x64=128 bits multiplication in 32bit chunks */ u64 x, y, z; - x = (u64)a_lo * b_lo; - y = (u64)a_lo * b_hi + (u32)(x >> 32); - z = (u64)a_hi * b_hi + (u32)(y >> 32); - y = (u64)a_hi * b_lo + (u32)y; - z += (u32)(y >> 32); - x = (y << 32) + (u32)x; - - u64 n_lo = x, n_hi = z; + /* Since (x-1)(x-1) + 2(x-1) == x.x - 1 two u32 can be added to a u64 */ + x = mul_add(a, b, c); + y = mul_add(a, b >> 32, c >> 32); + y = add_u64_u32(y, x >> 32); + z = mul_add(a >> 32, b >> 32, y >> 32); + y = mul_add(a >> 32, b, y); + *p_lo = (y << 32) + (u32)x; + return add_u64_u32(z, y >> 32); +} +#endif +#ifndef BITS_PER_ITER +#define BITS_PER_ITER (__LONG_WIDTH__ >= 64 ? 32 : 16) #endif - /* make sure c is not zero, trigger runtime exception otherwise */ - if (unlikely(c == 0)) { - unsigned long zero = 0; +#if BITS_PER_ITER == 32 +#define mul_u64_long_add_u64(p_lo, a, b, c) mul_u64_u64_add_u64(p_lo, a, b, c) +#define add_u64_long(a, b) ((a) + (b)) +#else +#undef BITS_PER_ITER +#define BITS_PER_ITER 16 +static inline u32 mul_u64_long_add_u64(u64 *p_lo, u64 a, u32 b, u64 c) +{ + u64 n_lo = mul_add(a, b, c); + u64 n_med = mul_add(a >> 32, b, c >> 32); - OPTIMIZER_HIDE_VAR(zero); - return ~0UL/zero; - } + n_med = add_u64_u32(n_med, n_lo >> 32); + *p_lo = n_med << 32 | (u32)n_lo; + return n_med >> 32; +} + +#define add_u64_long(a, b) add_u64_u32(a, b) +#endif + +u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d) +{ + unsigned long d_msig, q_digit; + unsigned int reps, d_z_hi; + u64 quotient, n_lo, n_hi; + u32 overflow; - int shift = __builtin_ctzll(c); + n_hi = mul_u64_u64_add_u64(&n_lo, a, b, c); - /* try reducing the fraction in case the dividend becomes <= 64 bits */ - if ((n_hi >> shift) == 0) { - u64 n = shift ? (n_lo >> shift) | (n_hi << (64 - shift)) : n_lo; + if (!n_hi) + return div64_u64(n_lo, d); - return div64_u64(n, c >> shift); - /* - * The remainder value if needed would be: - * res = div64_u64_rem(n, c >> shift, &rem); - * rem = (rem << shift) + (n_lo - (n << shift)); - */ - } + if (unlikely(n_hi >= d)) { + /* trigger runtime exception if divisor is zero */ + if (d == 0) { + unsigned long zero = 0; - if (n_hi >= c) { + OPTIMIZER_HIDE_VAR(zero); + return ~0UL/zero; + } /* overflow: result is unrepresentable in a u64 */ - return -1; + return ~0ULL; } - /* Do the full 128 by 64 bits division */ - - shift = __builtin_clzll(c); - c <<= shift; + /* Left align the divisor, shifting the dividend to match */ + d_z_hi = __builtin_clzll(d); + if (d_z_hi) { + d <<= d_z_hi; + n_hi = n_hi << d_z_hi | n_lo >> (64 - d_z_hi); + n_lo <<= d_z_hi; + } - int p = 64 + shift; - u64 res = 0; - bool carry; + reps = 64 / BITS_PER_ITER; + /* Optimise loop count for small dividends */ + if (!(u32)(n_hi >> 32)) { + reps -= 32 / BITS_PER_ITER; + n_hi = n_hi << 32 | n_lo >> 32; + n_lo <<= 32; + } +#if BITS_PER_ITER == 16 + if (!(u32)(n_hi >> 48)) { + reps--; + n_hi = add_u64_u32(n_hi << 16, n_lo >> 48); + n_lo <<= 16; + } +#endif - do { - carry = n_hi >> 63; - shift = carry ? 1 : __builtin_clzll(n_hi); - if (p < shift) - break; - p -= shift; - n_hi <<= shift; - n_hi |= n_lo >> (64 - shift); - n_lo <<= shift; - if (carry || (n_hi >= c)) { - n_hi -= c; - res |= 1ULL << p; + /* Invert the dividend so we can use add instead of subtract. */ + n_lo = ~n_lo; + n_hi = ~n_hi; + + /* + * Get the most significant BITS_PER_ITER bits of the divisor. + * This is used to get a low 'guestimate' of the quotient digit. + */ + d_msig = (d >> (64 - BITS_PER_ITER)) + 1; + + /* + * Now do a 'long division' with BITS_PER_ITER bit 'digits'. + * The 'guess' quotient digit can be low and BITS_PER_ITER+1 bits. + * The worst case is dividing ~0 by 0x8000 which requires two subtracts. + */ + quotient = 0; + while (reps--) { + q_digit = (unsigned long)(~n_hi >> (64 - 2 * BITS_PER_ITER)) / d_msig; + /* Shift 'n' left to align with the product q_digit * d */ + overflow = n_hi >> (64 - BITS_PER_ITER); + n_hi = add_u64_u32(n_hi << BITS_PER_ITER, n_lo >> (64 - BITS_PER_ITER)); + n_lo <<= BITS_PER_ITER; + /* Add product to negated divisor */ + overflow += mul_u64_long_add_u64(&n_hi, d, q_digit, n_hi); + /* Adjust for the q_digit 'guestimate' being low */ + while (overflow < 0xffffffff >> (32 - BITS_PER_ITER)) { + q_digit++; + n_hi += d; + overflow += n_hi < d; } - } while (n_hi); - /* The remainder value if needed would be n_hi << p */ + quotient = add_u64_long(quotient << BITS_PER_ITER, q_digit); + } - return res; + /* + * The above only ensures the remainder doesn't overflow, + * it can still be possible to add (aka subtract) another copy + * of the divisor. + */ + if ((n_hi + d) > n_hi) + quotient++; + return quotient; } -EXPORT_SYMBOL(mul_u64_u64_div_u64); +#if !defined(test_mul_u64_add_u64_div_u64) +EXPORT_SYMBOL(mul_u64_add_u64_div_u64); +#endif #endif diff --git a/lib/math/test_mul_u64_u64_div_u64.c b/lib/math/test_mul_u64_u64_div_u64.c index 58d058de4e73..338d014f0c73 100644 --- a/lib/math/test_mul_u64_u64_div_u64.c +++ b/lib/math/test_mul_u64_u64_div_u64.c @@ -10,80 +10,141 @@ #include <linux/printk.h> #include <linux/math64.h> -typedef struct { u64 a; u64 b; u64 c; u64 result; } test_params; +typedef struct { u64 a; u64 b; u64 d; u64 result; uint round_up;} test_params; static test_params test_values[] = { /* this contains many edge values followed by a couple random values */ -{ 0xb, 0x7, 0x3, 0x19 }, -{ 0xffff0000, 0xffff0000, 0xf, 0x1110eeef00000000 }, -{ 0xffffffff, 0xffffffff, 0x1, 0xfffffffe00000001 }, -{ 0xffffffff, 0xffffffff, 0x2, 0x7fffffff00000000 }, -{ 0x1ffffffff, 0xffffffff, 0x2, 0xfffffffe80000000 }, -{ 0x1ffffffff, 0xffffffff, 0x3, 0xaaaaaaa9aaaaaaab }, -{ 0x1ffffffff, 0x1ffffffff, 0x4, 0xffffffff00000000 }, -{ 0xffff000000000000, 0xffff000000000000, 0xffff000000000001, 0xfffeffffffffffff }, -{ 0x3333333333333333, 0x3333333333333333, 0x5555555555555555, 0x1eb851eb851eb851 }, -{ 0x7fffffffffffffff, 0x2, 0x3, 0x5555555555555554 }, -{ 0xffffffffffffffff, 0x2, 0x8000000000000000, 0x3 }, -{ 0xffffffffffffffff, 0x2, 0xc000000000000000, 0x2 }, -{ 0xffffffffffffffff, 0x4000000000000004, 0x8000000000000000, 0x8000000000000007 }, -{ 0xffffffffffffffff, 0x4000000000000001, 0x8000000000000000, 0x8000000000000001 }, -{ 0xffffffffffffffff, 0x8000000000000001, 0xffffffffffffffff, 0x8000000000000001 }, -{ 0xfffffffffffffffe, 0x8000000000000001, 0xffffffffffffffff, 0x8000000000000000 }, -{ 0xffffffffffffffff, 0x8000000000000001, 0xfffffffffffffffe, 0x8000000000000001 }, -{ 0xffffffffffffffff, 0x8000000000000001, 0xfffffffffffffffd, 0x8000000000000002 }, -{ 0x7fffffffffffffff, 0xffffffffffffffff, 0xc000000000000000, 0xaaaaaaaaaaaaaaa8 }, -{ 0xffffffffffffffff, 0x7fffffffffffffff, 0xa000000000000000, 0xccccccccccccccca }, -{ 0xffffffffffffffff, 0x7fffffffffffffff, 0x9000000000000000, 0xe38e38e38e38e38b }, -{ 0x7fffffffffffffff, 0x7fffffffffffffff, 0x5000000000000000, 0xccccccccccccccc9 }, -{ 0xffffffffffffffff, 0xfffffffffffffffe, 0xffffffffffffffff, 0xfffffffffffffffe }, -{ 0xe6102d256d7ea3ae, 0x70a77d0be4c31201, 0xd63ec35ab3220357, 0x78f8bf8cc86c6e18 }, -{ 0xf53bae05cb86c6e1, 0x3847b32d2f8d32e0, 0xcfd4f55a647f403c, 0x42687f79d8998d35 }, -{ 0x9951c5498f941092, 0x1f8c8bfdf287a251, 0xa3c8dc5f81ea3fe2, 0x1d887cb25900091f }, -{ 0x374fee9daa1bb2bb, 0x0d0bfbff7b8ae3ef, 0xc169337bd42d5179, 0x03bb2dbaffcbb961 }, -{ 0xeac0d03ac10eeaf0, 0x89be05dfa162ed9b, 0x92bb1679a41f0e4b, 0xdc5f5cc9e270d216 }, +{ 0xb, 0x7, 0x3, 0x19, 1 }, +{ 0xffff0000, 0xffff0000, 0xf, 0x1110eeef00000000, 0 }, +{ 0xffffffff, 0xffffffff, 0x1, 0xfffffffe00000001, 0 }, +{ 0xffffffff, 0xffffffff, 0x2, 0x7fffffff00000000, 1 }, +{ 0x1ffffffff, 0xffffffff, 0x2, 0xfffffffe80000000, 1 }, +{ 0x1ffffffff, 0xffffffff, 0x3, 0xaaaaaaa9aaaaaaab, 0 }, +{ 0x1ffffffff, 0x1ffffffff, 0x4, 0xffffffff00000000, 1 }, +{ 0xffff000000000000, 0xffff000000000000, 0xffff000000000001, 0xfffeffffffffffff, 1 }, +{ 0x3333333333333333, 0x3333333333333333, 0x5555555555555555, 0x1eb851eb851eb851, 1 }, +{ 0x7fffffffffffffff, 0x2, 0x3, 0x5555555555555554, 1 }, +{ 0xffffffffffffffff, 0x2, 0x8000000000000000, 0x3, 1 }, +{ 0xffffffffffffffff, 0x2, 0xc000000000000000, 0x2, 1 }, +{ 0xffffffffffffffff, 0x4000000000000004, 0x8000000000000000, 0x8000000000000007, 1 }, +{ 0xffffffffffffffff, 0x4000000000000001, 0x8000000000000000, 0x8000000000000001, 1 }, +{ 0xffffffffffffffff, 0x8000000000000001, 0xffffffffffffffff, 0x8000000000000001, 0 }, +{ 0xfffffffffffffffe, 0x8000000000000001, 0xffffffffffffffff, 0x8000000000000000, 1 }, +{ 0xffffffffffffffff, 0x8000000000000001, 0xfffffffffffffffe, 0x8000000000000001, 1 }, +{ 0xffffffffffffffff, 0x8000000000000001, 0xfffffffffffffffd, 0x8000000000000002, 1 }, +{ 0x7fffffffffffffff, 0xffffffffffffffff, 0xc000000000000000, 0xaaaaaaaaaaaaaaa8, 1 }, +{ 0xffffffffffffffff, 0x7fffffffffffffff, 0xa000000000000000, 0xccccccccccccccca, 1 }, +{ 0xffffffffffffffff, 0x7fffffffffffffff, 0x9000000000000000, 0xe38e38e38e38e38b, 1 }, +{ 0x7fffffffffffffff, 0x7fffffffffffffff, 0x5000000000000000, 0xccccccccccccccc9, 1 }, +{ 0xffffffffffffffff, 0xfffffffffffffffe, 0xffffffffffffffff, 0xfffffffffffffffe, 0 }, +{ 0xe6102d256d7ea3ae, 0x70a77d0be4c31201, 0xd63ec35ab3220357, 0x78f8bf8cc86c6e18, 1 }, +{ 0xf53bae05cb86c6e1, 0x3847b32d2f8d32e0, 0xcfd4f55a647f403c, 0x42687f79d8998d35, 1 }, +{ 0x9951c5498f941092, 0x1f8c8bfdf287a251, 0xa3c8dc5f81ea3fe2, 0x1d887cb25900091f, 1 }, +{ 0x374fee9daa1bb2bb, 0x0d0bfbff7b8ae3ef, 0xc169337bd42d5179, 0x03bb2dbaffcbb961, 1 }, +{ 0xeac0d03ac10eeaf0, 0x89be05dfa162ed9b, 0x92bb1679a41f0e4b, 0xdc5f5cc9e270d216, 1 }, }; /* * The above table can be verified with the following shell script: - * - * #!/bin/sh - * sed -ne 's/^{ \+\(.*\), \+\(.*\), \+\(.*\), \+\(.*\) },$/\1 \2 \3 \4/p' \ - * lib/math/test_mul_u64_u64_div_u64.c | - * while read a b c r; do - * expected=$( printf "obase=16; ibase=16; %X * %X / %X\n" $a $b $c | bc ) - * given=$( printf "%X\n" $r ) - * if [ "$expected" = "$given" ]; then - * echo "$a * $b / $c = $r OK" - * else - * echo "$a * $b / $c = $r is wrong" >&2 - * echo "should be equivalent to 0x$expected" >&2 - * exit 1 - * fi - * done + +#!/bin/sh +sed -ne 's/^{ \+\(.*\), \+\(.*\), \+\(.*\), \+\(.*\), \+\(.*\) },$/\1 \2 \3 \4 \5/p' \ + lib/math/test_mul_u64_u64_div_u64.c | +while read a b d r e; do + expected=$( printf "obase=16; ibase=16; %X * %X / %X\n" $a $b $d | bc ) + given=$( printf "%X\n" $r ) + if [ "$expected" = "$given" ]; then + echo "$a * $b / $d = $r OK" + else + echo "$a * $b / $d = $r is wrong" >&2 + echo "should be equivalent to 0x$expected" >&2 + exit 1 + fi + expected=$( printf "obase=16; ibase=16; (%X * %X + %X) / %X\n" $a $b $((d-1)) $d | bc ) + given=$( printf "%X\n" $((r + e)) ) + if [ "$expected" = "$given" ]; then + echo "$a * $b +/ $d = $(printf '%#x' $((r + e))) OK" + else + echo "$a * $b +/ $d = $(printf '%#x' $((r + e))) is wrong" >&2 + echo "should be equivalent to 0x$expected" >&2 + exit 1 + fi +done + */ -static int __init test_init(void) +static u64 test_mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d); +#if __LONG_WIDTH__ >= 64 +#define TEST_32BIT_DIV +static u64 test_mul_u64_add_u64_div_u64_32bit(u64 a, u64 b, u64 c, u64 d); +#endif + +static int __init test_run(unsigned int fn_no, const char *fn_name) { + u64 start_time; + int errors = 0; + int tests = 0; int i; - pr_info("Starting mul_u64_u64_div_u64() test\n"); + start_time = ktime_get_ns(); for (i = 0; i < ARRAY_SIZE(test_values); i++) { u64 a = test_values[i].a; u64 b = test_values[i].b; - u64 c = test_values[i].c; + u64 d = test_values[i].d; u64 expected_result = test_values[i].result; - u64 result = mul_u64_u64_div_u64(a, b, c); + u64 result, result_up; + + switch (fn_no) { + default: + result = mul_u64_u64_div_u64(a, b, d); + result_up = mul_u64_u64_div_u64_roundup(a, b, d); + break; + case 1: + result = test_mul_u64_add_u64_div_u64(a, b, 0, d); + result_up = test_mul_u64_add_u64_div_u64(a, b, d - 1, d); + break; +#ifdef TEST_32BIT_DIV + case 2: + result = test_mul_u64_add_u64_div_u64_32bit(a, b, 0, d); + result_up = test_mul_u64_add_u64_div_u64_32bit(a, b, d - 1, d); + break; +#endif + } + + tests += 2; if (result != expected_result) { - pr_err("ERROR: 0x%016llx * 0x%016llx / 0x%016llx\n", a, b, c); + pr_err("ERROR: 0x%016llx * 0x%016llx / 0x%016llx\n", a, b, d); pr_err("ERROR: expected result: %016llx\n", expected_result); pr_err("ERROR: obtained result: %016llx\n", result); + errors++; + } + expected_result += test_values[i].round_up; + if (result_up != expected_result) { + pr_err("ERROR: 0x%016llx * 0x%016llx +/ 0x%016llx\n", a, b, d); + pr_err("ERROR: expected result: %016llx\n", expected_result); + pr_err("ERROR: obtained result: %016llx\n", result_up); + errors++; } } - pr_info("Completed mul_u64_u64_div_u64() test\n"); + pr_info("Completed %s() test, %d tests, %d errors, %llu ns\n", + fn_name, tests, errors, ktime_get_ns() - start_time); + return errors; +} + +static int __init test_init(void) +{ + pr_info("Starting mul_u64_u64_div_u64() test\n"); + if (test_run(0, "mul_u64_u64_div_u64")) + return -EINVAL; + if (test_run(1, "test_mul_u64_u64_div_u64")) + return -EINVAL; +#ifdef TEST_32BIT_DIV + if (test_run(2, "test_mul_u64_u64_div_u64_32bit")) + return -EINVAL; +#endif return 0; } @@ -91,6 +152,36 @@ static void __exit test_exit(void) { } +/* Compile the generic mul_u64_add_u64_div_u64() code */ +#undef __div64_32 +#define __div64_32 __div64_32 +#define div_s64_rem div_s64_rem +#define div64_u64_rem div64_u64_rem +#define div64_u64 div64_u64 +#define div64_s64 div64_s64 +#define iter_div_u64_rem iter_div_u64_rem + +#undef mul_u64_add_u64_div_u64 +#define mul_u64_add_u64_div_u64 test_mul_u64_add_u64_div_u64 +#define test_mul_u64_add_u64_div_u64 test_mul_u64_add_u64_div_u64 + +#include "div64.c" + +#ifdef TEST_32BIT_DIV +/* Recompile the generic code for 32bit long */ +#undef test_mul_u64_add_u64_div_u64 +#define test_mul_u64_add_u64_div_u64 test_mul_u64_add_u64_div_u64_32bit +#undef BITS_PER_ITER +#define BITS_PER_ITER 16 + +#define mul_u64_u64_add_u64 mul_u64_u64_add_u64_32bit +#undef mul_u64_long_add_u64 +#undef add_u64_long +#undef mul_add + +#include "div64.c" +#endif + module_init(test_init); module_exit(test_exit); |
