diff --git a/src/besselj.jl b/src/besselj.jl index 34da648..32eb9bb 100644 --- a/src/besselj.jl +++ b/src/besselj.jl @@ -59,21 +59,11 @@ function _besselj0(x::Float64) q2 = (-1/8, 25/384, -1073/5120, 375733/229376) p = evalpoly(x2, p2) q = evalpoly(x2, q2) - if x > 1e15 - a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -PIO4(T)) - s_x, c_x = sincos(x) - s_xn, c_xn = sincos(xn) - return a * (c_x * c_xn - s_x * s_xn) - end end a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -PIO4(T)) - - # the following computes b = cos(x + xn) more accurately - # see src/misc.jl - b = cos_sum(x, xn) + xn = xinv*q + b = sin_sum(x, pi/4, xn) return a * b end end @@ -137,21 +127,11 @@ function _besselj1(x::Float64) q2 = (3/8, -21/128, 1899/5120, -543483/229376) p = evalpoly(x2, p2) q = evalpoly(x2, q2) - if x > 1e15 - a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -3 * PIO4(T)) - s_x, c_x = sincos(x) - s_xn, c_xn = sincos(xn) - return s * a * (c_x * c_xn - s_x * s_xn) - end end a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -3 * PIO4(T)) - - # the following computes b = cos(x + xn) more accurately - # see src/misc.jl - b = cos_sum(x, xn) + xn = xinv*q + b = sin_sum(x, -pi/4, xn) return a * b * s end end @@ -182,7 +162,7 @@ end ##### besselj0(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli0(z/im) -besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im +besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im # Bessel functions of the first kind of order nu @@ -208,17 +188,17 @@ besselj1(z::T) where T <: Union{ComplexF32, ComplexF64} = besseli1(z/im) * im # # For values where the expansions for large arguments and orders are not valid, backward recurrence is employed after shifting the order up # to where `besseljy_debye` is accurate then using downward recurrence. In general, the routine will be the slowest when ν ≈ x as all methods struggle at this point. -# +# # [1] http://dlmf.nist.gov/10.2.E2 -# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments." +# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments." # Advances in Computational Mathematics 45.1 (2019): 173-211. -# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions." +# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions." # Applied and Computational Harmonic Analysis 1.1 (1993): 116-135. -# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime. +# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime. # Applied and Computational Harmonic Analysis, 39(2), 347-356. -# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method." +# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method." # Computer physics communications 76.3 (1993): 381-388. -# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables. +# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables. # Vol. 55. US Government printing office, 1964. # @@ -432,7 +412,7 @@ end # Cutoff for Float32 determined from using Float64 precision down to eps(Float32) besselj_series_cutoff(v, x::Float32) = (x < 20.0) || v > (14.4 + x*(-0.455 + 0.027*x)) besselj_series_cutoff(v, x::Float64) = (x < 7.0) || v > (2 + x*(0.109 + 0.062*x)) -# cutoff for Float128 for ~1e-35 relative error +# cutoff for Float128 for ~1e-35 relative error #besselj_series_cutoff(v, x::AbstractFloat) = (x < 4.0) || v > (x*(0.08 + 0.12*x)) #= @@ -467,7 +447,7 @@ end # On the other hand, shifting the order down avoids any concern about underflow for large orders # Shifting the order too high while keeping x fixed could result in numerical underflow # Therefore we need to shift up only until the `besseljy_debye` is accurate and need to test that no underflow occurs -# Shifting the order up decreases the value substantially for high orders and results +# Shifting the order up decreases the value substantially for high orders and results # in a stable forward recurrence as the values rapidly increase function besselj_recurrence(nu, x) # shift order up to where expansions are valid see src/U_polynomials.jl diff --git a/src/bessely.jl b/src/bessely.jl index fd50753..a66e4c1 100644 --- a/src/bessely.jl +++ b/src/bessely.jl @@ -64,7 +64,7 @@ function _bessely0_compute(x::Float64) z = w * w p = evalpoly(z, PP_y0(T)) / evalpoly(z, PQ_y0(T)) q = evalpoly(z, QP_y0(T)) / evalpoly(z, QQ_y0(T)) - xn = x - PIO4(T) + xn = x - pi/4 sc = sincos(xn) p = p * sc[1] + w * q * sc[2] return p * SQ2OPI(T) / sqrt(x) @@ -81,21 +81,11 @@ function _bessely0_compute(x::Float64) q2 = (-1/8, 25/384, -1073/5120, 375733/229376) p = evalpoly(x2, p2) q = evalpoly(x2, q2) - if x > 1e15 - a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -PIO4(T)) - s_x, c_x = sincos(x) - s_xn, c_xn = sincos(xn) - return a * (s_x * c_xn + c_x * s_xn) - end end a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, -PIO4(T)) - - # the following computes b = sin(x + xn) more accurately - # see src/misc.jl - b = sin_sum(x, xn) + xn = xinv*q + b = sin_sum(x, -pi/4, xn) return a * b end end @@ -170,21 +160,11 @@ function _bessely1_compute(x::Float64) q2 = (3/8, -21/128, 1899/5120, -543483/229376) p = evalpoly(x2, p2) q = evalpoly(x2, q2) - if x > 1e15 - a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, - 3 * PIO4(T)) - s_x, c_x = sincos(x) - s_xn, c_xn = sincos(xn) - return a * (s_x * c_xn + c_x * s_xn) - end end a = SQ2OPI(T) * sqrt(xinv) * p - xn = muladd(xinv, q, - 3 * PIO4(T)) - - # the following computes b = sin(x + xn) more accurately - # see src/misc.jl - b = sin_sum(x, xn) + xn = xinv*q + b = sin_sum(-x, -pi/4, -xn) return a * b end end @@ -225,7 +205,7 @@ end # for positive arguments and orders. For integer orders up to 250, forward recurrence is used starting from # `bessely0` and `bessely1` routines for calculation of Y_{n}(x) of the zero and first order. # For small arguments, Y_{ν}(x) is calculated from the power series (`bessely_power_series(nu, x`) form of J_{ν}(x) using the connection formula [1]. -# +# # When x < ν and ν being reasonably large, the debye asymptotic expansion (Eq. 33; [3]) is used `besseljy_debye(nu, x)`. # When x > ν and x being reasonably large, the Hankel function is calculated from the debye expansion (Eq. 29; [3]) with `hankel_debye(nu, x)` # and Y_{n}(x) is calculated from the imaginary part of the Hankel function. @@ -234,20 +214,20 @@ end # # For values where the expansions for large arguments and orders are not valid, forward recurrence is employed after shifting the order down # to where these expansions are valid then using recurrence. In general, the routine will be the slowest when ν ≈ x as all methods struggle at this point. -# Additionally, the Hankel expansion is only accurate (to Float64 precision) when x > 19 and the power series can only be computed for x < 7 +# Additionally, the Hankel expansion is only accurate (to Float64 precision) when x > 19 and the power series can only be computed for x < 7 # without loss of precision. Therefore, when x > 19 the Hankel expansion is used to generate starting values after shifting the order down. # When x ∈ (7, 19) and ν ∈ (0, 2) Chebyshev approximation is used with higher orders filled by recurrence. -# +# # [1] https://dlmf.nist.gov/10.2#E3 -# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments." +# [2] Bremer, James. "An algorithm for the rapid numerical evaluation of Bessel functions of real orders and arguments." # Advances in Computational Mathematics 45.1 (2019): 173-211. -# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions." +# [3] Matviyenko, Gregory. "On the evaluation of Bessel functions." # Applied and Computational Harmonic Analysis 1.1 (1993): 116-135. -# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime. +# [4] Heitman, Z., Bremer, J., Rokhlin, V., & Vioreanu, B. (2015). On the asymptotics of Bessel functions in the Fresnel regime. # Applied and Computational Harmonic Analysis, 39(2), 347-356. -# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method." +# [5] Ratis, Yu L., and P. Fernández de Córdoba. "A code to calculate (high order) Bessel functions based on the continued fractions method." # Computer physics communications 76.3 (1993): 381-388. -# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables. +# [6] Abramowitz, Milton, and Irene A. Stegun, eds. Handbook of mathematical functions with formulas, graphs, and mathematical tables. # Vol. 55. US Government printing office, 1964. # @@ -389,7 +369,7 @@ No checks on arguments are performed and should only be called if certain nu, x """ function bessely_positive_args(nu, x::T) where T iszero(x) && return -T(Inf) - + # use forward recurrence if nu is an integer up until it becomes inefficient (isinteger(nu) && nu < 250) && return besselj_up_recurrence(x, bessely1(x), bessely0(x), 1, nu)[1] @@ -418,7 +398,7 @@ end # combined to calculate both J_v and J_{-v} in the same loop # J_{-v} always converges slower so just check that convergence # Combining the loop was faster than using two separate loops -# +# # this works well for small arguments x < 7.0 for rel. error ~1e-14 # this also works well for nu > 1.35x - 4.5 # for nu > 25 more cancellation occurs near integer values @@ -575,7 +555,7 @@ function log_bessely_power_series(v, x::T) where T logout2 = -v * log(x/2) + log(out2) spi, cpi = sincospi(v) - + #tmp = logout2 + log(abs(sign*(inv(spi)) - exp(logout - logout2) * cpi / spi)) tmp = logout + log((-cpi + exp(logout2) - logout) / spi) diff --git a/src/misc.jl b/src/misc.jl index 6be8c72..b456d81 100644 --- a/src/misc.jl +++ b/src/misc.jl @@ -1,47 +1,92 @@ -# function to more accurately compute cos(x + xn) -# see https://github.com/heltonmc/Bessels.jl/pull/13 -# written by @oscardssmith -function cos_sum(x, xn) - s = x + xn - n, r = reduce_pi02_med(s) - lo = r.lo - ((s - x) - xn) - hi = r.hi + lo - y = Base.Math.DoubleFloat64(hi, r.hi-hi+lo) - n = n&3 +using Base.Math: sin_kernel, cos_kernel, sincos_kernel, rem_pio2_kernel, DoubleFloat64, DoubleFloat32 + +""" + computes sin(sum(xs)) where xs are sorted by absolute value + Doing this is much more accurate than the naive sin(sum(xs)) +""" +function sin_sum(xs::Vararg{T})::T where T<:Base.IEEEFloat + n, y = rem_pio2_sum(xs...) + n &= 3 if n == 0 - return Base.Math.cos_kernel(y) + return sin_kernel(y) elseif n == 1 - return -Base.Math.sin_kernel(y) + return cos_kernel(y) elseif n == 2 - return -Base.Math.cos_kernel(y) + return -sin_kernel(y) else - return Base.Math.sin_kernel(y) + return -cos_kernel(y) end end -# function to more accurately compute sin(x + xn) -function sin_sum(x, xn) - s = x + xn - n, r = reduce_pi02_med(s) - lo = r.lo - ((s - x) - xn) - hi = r.hi + lo - y = Base.Math.DoubleFloat64(hi, r.hi-hi+lo) - n = n&3 + +""" + computes sincos(sum(xs)) where xs are sorted by absolute value + Doing this is much more accurate than the naive sincos(sum(xs)) +""" +function sincos_sum(xs::Vararg{T})::T where T<:Base.IEEEFloat + n, y = rem_pio2_sum(xs...) + n &= 3 + si, co = sincos_kernel(y) if n == 0 - return Base.Math.sin_kernel(y) + return si, co elseif n == 1 - return Base.Math.cos_kernel(y) + return co, -si elseif n == 2 - return -Base.Math.sin_kernel(y) + return -si, -co else - return -Base.Math.cos_kernel(y) + return -co, si + end +end + +function rem_pio2_sum(xs::Vararg{Float64}) + n = 0 + hi, lo = 0.0, 0.0 + for x in xs + if abs(x) <= pi/4 + s = x + hi + lo += (x - (s - hi)) + else + n_i, y = rem_pio2_kernel(x) + n += n_i + s = y.hi + hi + lo += (y.hi - (s - hi)) + y.lo + end + hi = s + end + while hi > pi/4 + hi -= pi/2 + lo -= 6.123233995736766e-17 + n += 1 + end + while hi < -pi/4 + hi += pi/2 + lo += 6.123233995736766e-17 + n -= 1 + end + return n, DoubleFloat64(hi, lo) +end + +function rem_pio2_sum(xs::Vararg{Float32}) + y = 0.0 + n = 0 + # The minimum cosine or sine of any Float32 that gets reduced is 1.6e-9 + # so reducing at 2^22 prevents catastrophic loss of precision. + # There probably is a case where this loses some digits but it is a decent + # tradeoff between accuracy and speed. + @fastmath for x in xs + if x > 0x1p22 + n_i, y_i = rem_pio2_kernel(Float32(x)) + n += n_i + y += y_i.hi + else + y += x + end end + n_i, y = rem_pio2_kernel(y) + return n + n_i, DoubleFloat32(y.hi) end -@inline function reduce_pi02_med(x::Float64) - pio2_1 = 1.57079632673412561417e+00 - fn = round(x*(2/pi)) - r = muladd(-fn, pio2_1, x) - w = fn * 6.07710050650619224932e-11 - y = r-w - return unsafe_trunc(Int, fn), Base.Math.DoubleFloat64(y, (r-y)-w) +function rem_pio2_sum(xs::Vararg{Float16}) + y = sum(Float64, xs) #Float16 can be losslessly accumulated in Float64 + n, y = rem_pio2_kernel(y) + return n, DoubleFloat32(y.hi) end