From f6d375c0231747ff9b044b7494d43904c2f786ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?=
 <tomas.drvostep@gmail.com>
Date: Fri, 21 Jun 2024 17:21:52 +0200
Subject: [PATCH] Use the formulas from Hacker's delight for both signed and
 unsigned Ints

---
 src/fldmod-by-const.jl        | 142 +++++++++-------------------------
 test/fldmod-by-const_tests.jl |  32 +-------
 2 files changed, 39 insertions(+), 135 deletions(-)

diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
index 1562a4a..6c62ce3 100644
--- a/src/fldmod-by-const.jl
+++ b/src/fldmod-by-const.jl
@@ -68,93 +68,44 @@ end
     return x - quotient * y
 end
 
-# This function is based on the native code produced by the following:
-# @code_native ((x)->div(x, 100))(Int64(2))
-function div_by_const(x::T, ::Val{C}) where {T, C}
-    # These checks will be compiled away during specialization.
-    # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these
-    # checks allow this function to work for any `C > 0`, in case that's useful in the
-    # future.
-    if C == 1
-        return x
-    elseif ispow2(C)
-        return div(x, C)  # Will already do the right thing
-    elseif C <= 0
-        throw(DomainError("C must be > 0"))
-    end
-    # Calculate the magic number 2^N/C. Note that this is computed statically, not at
-    # runtime.
-    inverse_coeff, toshift = calculate_inverse_coeff(T, C)
-    # Compute the upper-half of widemul(x, 2^nbits(T)/C).
-    # By keeping only the upper half, we're essentially dividing by 2^nbits(T), undoing the
-    # numerator of the multiplication, so that the result is equal to x/C.
-    out = mul_hi(x, inverse_coeff)
-    # This condition will be compiled away during specialization.
-    if T <: Signed
-        # Because our magic number has a leading one (since we shift all-the-way left), the
-        # result is negative if it's Signed. We add x to give us the positive equivalent.
-        out += x
-        signshift = (nbits(x) - 1)
-        isnegative = T(out >>> signshift)  # 1 if < 0 else 0 (Unsigned bitshift to read top bit)
-    end
-    # Undo the bitshifts used to calculate the invoeff magic number with maximum precision.
-    out = out >> toshift
-    if T <: Signed
-        out =  out + isnegative
+# Unsigned magic number computation + shift by constant
+# See Hacker's delight, equations (26) and (27) from Chapter 10-9.
+# nmax and divisor must be > 0.
+Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor)
+    T = typeof(nmax)
+    W = _widen(T)
+    d = W(divisor)
+
+    nc = div(W(nmax) + W(1), d) * d - W(1)
+    nbits = sizeof(nmax) * 8 - leading_zeros(nmax) # most significant bit
+    for p in 0:2nbits
+        if W(2)^p > nc * (d - W(1) - rem(W(2)^p - W(1), d))       # (27)
+            m = div(W(2)^p + d - W(1) - rem(W(2)^p - W(1), d), d) # (26)
+            return (m, p)
+        end
     end
-    return T(out)
+    return W(0), 0 # Should never reach here
 end
 
-
-
-
-# REQUIRES: C must not be 0, 1, -1
-# See this implementation in LLVM:
-# https://llvm.org/doxygen/DivisionByConstantInfo_8cpp_source.html
-# which was taken from "Hacker's Delight".
-Base.@assume_effects :foldable function calculate_inverse_coeff(::Type{T}, C) where {T}
-    # First, calculate 2^nbits(T)/C
-    # We shift away leading zeros to preserve the most precision when we use it to multiply
-    # in the next step. At the end, we will shift the final answer back to undo this
-    # operation (which is why we need to return `toshift`).
-    # Note, also, that we calculate invcoeff at double-precision so that the left-shift
-    # doesn't leave trailing zeros. We truncate to only the upper-half before returning.
-    UT = _unsigned(T)
-    invcoeff = typemax(_widen(UT)) ÷ C
-    toshift = leading_zeros(invcoeff)
-    invcoeff = invcoeff << toshift
-    # Now, truncate to only the upper half of invcoeff, after we've shifted. Instead of
-    # bitshifting, we round to maintain precision. (This is needed to prevent off-by-ones.)
-    # -- This is equivalent to `invcoeff = T(invcoeff >> sizeof(T))`, except rounded. --
-    two_to_N = _widen(typemax(UT)) + UT(1) # Precise value for 2^nbits(T) (doesn't fit in T)
-    invcoeff = _round_to_nearest(fldmod(invcoeff, two_to_N)..., two_to_N ) % T
-    return invcoeff, toshift
-end
-
-function mul_hi(x::T, y::T) where T
-    xy = _widemul(x, y)  # support Int256 -> Int512 (!!)
-    (xy >> nbits(T)) % T
-end
-
-
-
-# Unsigned magic number computation + divide by constant
-# Hacker's delight figure 10–4
-Base.@assume_effects :foldable function magicgu(nmax, d)
+# See Hacker's delight, equations (5) and (6) from Chapter 10-4.
+# nmax and divisor must be > 0.
+Base.@assume_effects :foldable function magicg(nmax::Signed, divisor)
     T = typeof(nmax)
-    nc = (nmax ÷ d) * d - 1
-    N = T(floor(log2(nmax) + 1))
-    MathType = _widen(T)
-    two = MathType(2)
-    for p in 0:2N
-        if two^p > nc * (d - 1 - (2^p - 1) % d)
-            m = (two^p + d - 1 - (2^p - 1) % d) ÷ d
-            return (m%T, p)
+    W = _widen(T)
+    d = W(divisor)
+
+    nc = ((W(nmax) + W(1)) ÷ d) * d - W(1)
+    nbits = sizeof(nmax) * 8 - leading_zeros(nmax) # most significant bit
+    for p in 0:2nbits
+        if W(2)^p > nc * (d - rem(W(2)^p, d))       # (6)
+            m = div(W(2)^p + d - rem(W(2)^p, d), d) # (5)
+            return (m, p)
         end
     end
-    error("Can't find p, something is wrong. nmax: $(repr(nmax)), d: $(repr(d)), N: $N")
+    return W(0), 0 # Should never reach here
 end
-function div_by_const(x::T, ::Val{C}) where {T<:Unsigned, C}
+
+function div_by_const(x::T, ::Val{C}) where {T, C}
     # These checks will be compiled away during specialization.
     # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these
     # checks allow this function to work for any `C > 0`, in case that's useful in the
@@ -162,31 +113,14 @@ function div_by_const(x::T, ::Val{C}) where {T<:Unsigned, C}
     if C == 1
         return x
     elseif ispow2(C)
-        return div(x, C)  # Will already do the right thing
+        return div(x, C) # Will already do the right thing
     elseif C <= 0
         throw(DomainError("C must be > 0"))
     end
-    # Calculate the magic number 2^N/C. Note that this is computed statically, not at
-    # runtime.
-    inverse_coeff, toshift = magicgu(typemax(T), C)
-    out = _widemul(x, inverse_coeff)  # support Int256 -> Int512 (!!)
-    out = out >> toshift
-    return out % T
-end
-
-
-
+    # Calculate the magic number and shift amount, based on Hacker's Delight, Chapter 10.
+    magic_number, shift = magicg(typemax(T), C)
 
-
-
-# Annoyingly, Unsigned(T) isn't defined for BitIntegers types:
-# https://github.com/rfourquet/BitIntegers.jl/pull/2
-# Note: We do not need this for Int512, since we only widen to 512 _after_ calling
-# _unsigned, above. This code is only for supporting the built-in integer types, which only
-# go up to 128-bits (widened twice to 512). If a user wants to extend FixedDecimals for
-# other integer types, they will need to add methods to either _unsigned or unsigned.
-_unsigned(x) = unsigned(x)
-_unsigned(::Type{Int256}) = UInt256
-_unsigned(::Type{UInt256}) = UInt256
-
-nbits(x) = sizeof(x) * 8
+    out = _widemul(promote(x, magic_number)...)
+    out >>= shift
+    return (out % T) + (x < zero(T)) # TODO: Figure out why and if the + 1 if non-positive is necessary
+end
diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl
index 0adaa35..95dfce9 100644
--- a/test/fldmod-by-const_tests.jl
+++ b/test/fldmod-by-const_tests.jl
@@ -1,36 +1,6 @@
 using Test
 using FixedPointDecimals
 
-@testset "calculate_inverse_coeff signed" begin
-    using FixedPointDecimals: calculate_inverse_coeff
-
-    # The correct magic number here comes from investigating the following native code
-    # produced on an m2 aarch64 macbook pro:
-    # @code_native ((x)->fld(x,100))(2)
-    #   ...
-    #         mov     x8, #55051
-    #         movk    x8, #28835, lsl #16
-    #         movk    x8, #2621, lsl #32
-    #         movk    x8, #41943, lsl #48
-    # Where:
-    # julia> 55051 | 28835 << 16 | 2621 << 32 | 41943 << 48
-    # -6640827866535438581
-    @test calculate_inverse_coeff(Int64, 100) == (-6640827866535438581, 6)
-
-    # Same for the tests below:
-
-    # (LLVM's magic number is shifted one bit less, then they shift by 2, instead of 3,
-    #  but the result is the same.)
-    @test calculate_inverse_coeff(Int64, 10) == (7378697629483820647 << 1, 3)
-end
-
-@testset "calculate_inverse_coeff signed 4" begin
-    using FixedPointDecimals: calculate_inverse_coeff
-
-    # Same here, our magic number is shifted 2 bits more than LLVM's
-    @test calculate_inverse_coeff(UInt64, 100) == (0xa3d70a3d70a3d70b, 6)
-end
-
 @testset "div_by_const" begin
     vals = [2432, 100, 0x1, Int32(10000), typemax(Int64), typemax(Int16), 8, Int64(2)^32]
     for a_base in vals
@@ -53,7 +23,7 @@ end
     end
 end
 
-@testitem "fixed decimal multiplication - exhaustive 8-bit" begin
+@testset "fixed decimal multiplication - exhaustive 8-bit" begin
     @testset for P in (0,1)
         @testset for T in (Int8, UInt8)
             FD = FixedDecimal{T,P}