Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

better sin_sum #88

Merged
merged 3 commits into from
Apr 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 13 additions & 33 deletions src/besselj.jl
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed that the Float32 implementations still use p = p * cos(xn + x). Should this be switched over to the better sum formula now?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

possibly. it will be a bit slower, so it probably needs a benchmark and accuracy plot to determine.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can check it out tomorrow ! This weekend was busy with family but have time to look tomorrow more carefully

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeesh - the besselj0 implementation was some of my first julia code which has been improved but the tests aren't very robust.

julia> Bessels.besselj0(328049.34f0)
-0.0013240778f0

julia> Bessels.besselj0(Float64(328049.34f0))
-0.0013258623831875669

I'm imagining the better sin sum here could help.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM. I posted an issue at #90 to track that in a separate PR.

Original file line number Diff line number Diff line change
Expand Up @@ -59,21 +59,11 @@ function _besselj0(x::Float64)
q2 = (-1/8, 25/384, -1073/5120, 375733/229376)
p = evalpoly(x2, p2)
q = evalpoly(x2, q2)
if x > 1e15
a = SQ2OPI(T) * sqrt(xinv) * p
xn = muladd(xinv, q, -PIO4(T))
s_x, c_x = sincos(x)
s_xn, c_xn = sincos(xn)
return a * (c_x * c_xn - s_x * s_xn)
end
end

a = SQ2OPI(T) * sqrt(xinv) * p
xn = muladd(xinv, q, -PIO4(T))

# the following computes b = cos(x + xn) more accurately
# see src/misc.jl
b = cos_sum(x, xn)
xn = xinv*q
b = sin_sum(x, pi/4, xn)
return a * b
end
end
Expand Down Expand Up @@ -137,21 +127,11 @@ function _besselj1(x::Float64)
q2 = (3/8, -21/128, 1899/5120, -543483/229376)
p = evalpoly(x2, p2)
q = evalpoly(x2, q2)
if x > 1e15
a = SQ2OPI(T) * sqrt(xinv) * p
xn = muladd(xinv, q, -3 * PIO4(T))
s_x, c_x = sincos(x)
s_xn, c_xn = sincos(xn)
return s * a * (c_x * c_xn - s_x * s_xn)
end
end

a = SQ2OPI(T) * sqrt(xinv) * p
xn = muladd(xinv, q, -3 * PIO4(T))

# the following computes b = cos(x + xn) more accurately
# see src/misc.jl
b = cos_sum(x, xn)
xn = xinv*q
b = sin_sum(x, -pi/4, xn)
return a * b * s
end
end
Expand Down Expand Up @@ -182,7 +162,7 @@ end
#####

besselj0(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli0(z/im)
besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im
besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im


# Bessel functions of the first kind of order nu
Expand All @@ -208,17 +188,17 @@ besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im
#
# For values where the expansions for large arguments and orders are not valid, backward recurrence is employed after shifting the order up
# to where `besseljy_debye` is accurate then using downward recurrence. In general, the routine will be the slowest when ν ≈ x as all methods struggle at this point.
#
#
# [1] http://dlmf.nist.gov/10.2.E2
# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments."
# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments."
# Advances in Computational Mathematics 45.1 (2019): 173-211.
# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions."
# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions."
# Applied and Computational Harmonic Analysis 1.1 (1993): 116-135.
# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime.
# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime.
# Applied and Computational Harmonic Analysis, 39(2), 347-356.
# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method."
# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method."
# Computer physics communications 76.3 (1993): 381-388.
# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables.
# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables.
# Vol. 55. US Government printing office, 1964.
#

Expand Down Expand Up @@ -432,7 +412,7 @@ end
# Cutoff for Float32 determined from using Float64 precision down to eps(Float32)
besselj_series_cutoff(v, x::Float32) = (x < 20.0) || v > (14.4 + x*(-0.455 + 0.027*x))
besselj_series_cutoff(v, x::Float64) = (x < 7.0) || v > (2 + x*(0.109 + 0.062*x))
# cutoff for Float128 for ~1e-35 relative error
# cutoff for Float128 for ~1e-35 relative error
#besselj_series_cutoff(v, x::AbstractFloat) = (x < 4.0) || v > (x*(0.08 + 0.12*x))

#=
Expand Down Expand Up @@ -467,7 +447,7 @@ end
# On the other hand, shifting the order down avoids any concern about underflow for large orders
# Shifting the order too high while keeping x fixed could result in numerical underflow
# Therefore we need to shift up only until the `besseljy_debye` is accurate and need to test that no underflow occurs
# Shifting the order up decreases the value substantially for high orders and results
# Shifting the order up decreases the value substantially for high orders and results
# in a stable forward recurrence as the values rapidly increase
function besselj_recurrence(nu, x)
# shift order up to where expansions are valid see src/U_polynomials.jl
Expand Down
52 changes: 16 additions & 36 deletions src/bessely.jl
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ function _bessely0_compute(x::Float64)
z = w * w
p = evalpoly(z, PP_y0(T)) / evalpoly(z, PQ_y0(T))
q = evalpoly(z, QP_y0(T)) / evalpoly(z, QQ_y0(T))
xn = x - PIO4(T)
xn = x - pi/4
sc = sincos(xn)
p = p * sc[1] + w * q * sc[2]
return p * SQ2OPI(T) / sqrt(x)
Expand All @@ -81,21 +81,11 @@ function _bessely0_compute(x::Float64)
q2 = (-1/8, 25/384, -1073/5120, 375733/229376)
p = evalpoly(x2, p2)
q = evalpoly(x2, q2)
if x > 1e15
a = SQ2OPI(T) * sqrt(xinv) * p
xn = muladd(xinv, q, -PIO4(T))
s_x, c_x = sincos(x)
s_xn, c_xn = sincos(xn)
return a * (s_x * c_xn + c_x * s_xn)
end
end

a = SQ2OPI(T) * sqrt(xinv) * p
xn = muladd(xinv, q, -PIO4(T))

# the following computes b = sin(x + xn) more accurately
# see src/misc.jl
b = sin_sum(x, xn)
xn = xinv*q
b = sin_sum(x, -pi/4, xn)
return a * b
end
end
Expand Down Expand Up @@ -170,21 +160,11 @@ function _bessely1_compute(x::Float64)
q2 = (3/8, -21/128, 1899/5120, -543483/229376)
p = evalpoly(x2, p2)
q = evalpoly(x2, q2)
if x > 1e15
a = SQ2OPI(T) * sqrt(xinv) * p
xn = muladd(xinv, q, - 3 * PIO4(T))
s_x, c_x = sincos(x)
s_xn, c_xn = sincos(xn)
return a * (s_x * c_xn + c_x * s_xn)
end
end

a = SQ2OPI(T) * sqrt(xinv) * p
xn = muladd(xinv, q, - 3 * PIO4(T))

# the following computes b = sin(x + xn) more accurately
# see src/misc.jl
b = sin_sum(x, xn)
xn = xinv*q
b = sin_sum(-x, -pi/4, -xn)
return a * b
end
end
Expand Down Expand Up @@ -225,7 +205,7 @@ end
# for positive arguments and orders. For integer orders up to 250, forward recurrence is used starting from
# `bessely0` and `bessely1` routines for calculation of Y_{n}(x) of the zero and first order.
# For small arguments, Y_{ν}(x) is calculated from the power series (`bessely_power_series(nu, x`) form of J_{ν}(x) using the connection formula [1].
#
#
# When x < ν and ν being reasonably large, the debye asymptotic expansion (Eq. 33; [3]) is used `besseljy_debye(nu, x)`.
# When x > ν and x being reasonably large, the Hankel function is calculated from the debye expansion (Eq. 29; [3]) with `hankel_debye(nu, x)`
# and Y_{n}(x) is calculated from the imaginary part of the Hankel function.
Expand All @@ -234,20 +214,20 @@ end
#
# For values where the expansions for large arguments and orders are not valid, forward recurrence is employed after shifting the order down
# to where these expansions are valid then using recurrence. In general, the routine will be the slowest when ν ≈ x as all methods struggle at this point.
# Additionally, the Hankel expansion is only accurate (to Float64 precision) when x > 19 and the power series can only be computed for x < 7
# Additionally, the Hankel expansion is only accurate (to Float64 precision) when x > 19 and the power series can only be computed for x < 7
# without loss of precision. Therefore, when x > 19 the Hankel expansion is used to generate starting values after shifting the order down.
# When x ∈ (7, 19) and ν ∈ (0, 2) Chebyshev approximation is used with higher orders filled by recurrence.
#
#
# [1] https://dlmf.nist.gov/10.2#E3
# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments."
# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments."
# Advances in Computational Mathematics 45.1 (2019): 173-211.
# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions."
# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions."
# Applied and Computational Harmonic Analysis 1.1 (1993): 116-135.
# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime.
# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime.
# Applied and Computational Harmonic Analysis, 39(2), 347-356.
# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method."
# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method."
# Computer physics communications 76.3 (1993): 381-388.
# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables.
# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables.
# Vol. 55. US Government printing office, 1964.
#

Expand Down Expand Up @@ -389,7 +369,7 @@ No checks on arguments are performed and should only be called if certain nu, x
"""
function bessely_positive_args(nu, x::T) where T
iszero(x) && return -T(Inf)

# use forward recurrence if nu is an integer up until it becomes inefficient
(isinteger(nu) && nu < 250) && return besselj_up_recurrence(x, bessely1(x), bessely0(x), 1, nu)[1]

Expand Down Expand Up @@ -418,7 +398,7 @@ end
# combined to calculate both J_v and J_{-v} in the same loop
# J_{-v} always converges slower so just check that convergence
# Combining the loop was faster than using two separate loops
#
#
# this works well for small arguments x < 7.0 for rel. error ~1e-14
# this also works well for nu > 1.35x - 4.5
# for nu > 25 more cancellation occurs near integer values
Expand Down Expand Up @@ -575,7 +555,7 @@ function log_bessely_power_series(v, x::T) where T
logout2 = -v * log(x/2) + log(out2)

spi, cpi = sincospi(v)

#tmp = logout2 + log(abs(sign*(inv(spi)) - exp(logout - logout2) * cpi / spi))
tmp = logout + log((-cpi + exp(logout2) - logout) / spi)

Expand Down
111 changes: 78 additions & 33 deletions src/misc.jl
Original file line number Diff line number Diff line change
@@ -1,47 +1,92 @@
# function to more accurately compute cos(x + xn)
# see https://github.com/heltonmc/Bessels.jl/pull/13
# written by @oscardssmith
function cos_sum(x, xn)
s = x + xn
n, r = reduce_pi02_med(s)
lo = r.lo - ((s - x) - xn)
hi = r.hi + lo
y = Base.Math.DoubleFloat64(hi, r.hi-hi+lo)
n = n&3
using Base.Math: sin_kernel, cos_kernel, sincos_kernel, rem_pio2_kernel, DoubleFloat64, DoubleFloat32

"""
computes sin(sum(xs)) where xs are sorted by absolute value
Doing this is much more accurate than the naive sin(sum(xs))
"""
function sin_sum(xs::Vararg{T})::T where T<:Base.IEEEFloat
n, y = rem_pio2_sum(xs...)
n &= 3
if n == 0
return Base.Math.cos_kernel(y)
return sin_kernel(y)
elseif n == 1
return -Base.Math.sin_kernel(y)
return cos_kernel(y)
elseif n == 2
return -Base.Math.cos_kernel(y)
return -sin_kernel(y)
else
return Base.Math.sin_kernel(y)
return -cos_kernel(y)
end
end
# function to more accurately compute sin(x + xn)
function sin_sum(x, xn)
s = x + xn
n, r = reduce_pi02_med(s)
lo = r.lo - ((s - x) - xn)
hi = r.hi + lo
y = Base.Math.DoubleFloat64(hi, r.hi-hi+lo)
n = n&3

"""
computes sincos(sum(xs)) where xs are sorted by absolute value
Doing this is much more accurate than the naive sincos(sum(xs))
"""
function sincos_sum(xs::Vararg{T})::T where T<:Base.IEEEFloat
n, y = rem_pio2_sum(xs...)
n &= 3
si, co = sincos_kernel(y)
if n == 0
return Base.Math.sin_kernel(y)
return si, co
elseif n == 1
return Base.Math.cos_kernel(y)
return co, -si
elseif n == 2
return -Base.Math.sin_kernel(y)
return -si, -co
else
return -Base.Math.cos_kernel(y)
return -co, si
end
end

function rem_pio2_sum(xs::Vararg{Float64})
n = 0
hi, lo = 0.0, 0.0
for x in xs
if abs(x) <= pi/4
s = x + hi
lo += (x - (s - hi))
else
n_i, y = rem_pio2_kernel(x)
n += n_i
s = y.hi + hi
lo += (y.hi - (s - hi)) + y.lo
end
hi = s
end
while hi > pi/4
hi -= pi/2
lo -= 6.123233995736766e-17
n += 1
end
while hi < -pi/4
hi += pi/2
lo += 6.123233995736766e-17
n -= 1
end
return n, DoubleFloat64(hi, lo)
end

function rem_pio2_sum(xs::Vararg{Float32})
y = 0.0
n = 0
# The minimum cosine or sine of any Float32 that gets reduced is 1.6e-9
# so reducing at 2^22 prevents catastrophic loss of precision.
# There probably is a case where this loses some digits but it is a decent
# tradeoff between accuracy and speed.
@fastmath for x in xs
if x > 0x1p22
n_i, y_i = rem_pio2_kernel(Float32(x))
n += n_i
y += y_i.hi
else
y += x
end
end
n_i, y = rem_pio2_kernel(y)
return n + n_i, DoubleFloat32(y.hi)
end
@inline function reduce_pi02_med(x::Float64)
pio2_1 = 1.57079632673412561417e+00

fn = round(x*(2/pi))
r = muladd(-fn, pio2_1, x)
w = fn * 6.07710050650619224932e-11
y = r-w
return unsafe_trunc(Int, fn), Base.Math.DoubleFloat64(y, (r-y)-w)
function rem_pio2_sum(xs::Vararg{Float16})
y = sum(Float64, xs) #Float16 can be losslessly accumulated in Float64
n, y = rem_pio2_kernel(y)
return n, DoubleFloat32(y.hi)
end