diff --git a/docs/src/index.md b/docs/src/index.md
index 49c58a2..a90346c 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -8,7 +8,7 @@ CurrentModule = PSFModels
 [![Build Status](https://github.com/juliaastro/PSFModels.jl/workflows/CI/badge.svg?branch=main)](https://github.com/juliaastro/PSFModels.jl/actions)
 [![PkgEval](https://juliaci.github.io/NanosoldierReports/pkgeval_badges/P/PSFModels.svg)](https://juliaci.github.io/NanosoldierReports/pkgeval_badges/report.html)
 [![Coverage](https://codecov.io/gh/juliaastro/PSFModels.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/juliaastro/PSFModels.jl)
-[![License](https://img.shields.io/github/license/JuliaHCI/HCIToolbox.jl?color=yellow)](https://github.com/juliaastro/PSFModels.jl/blob/main/LICENSE)
+[![License](https://img.shields.io/github/license/juliaastro/PSFModels.jl?color=yellow)](https://github.com/juliaastro/PSFModels.jl/blob/main/LICENSE)
 
 ## Installation
 
diff --git a/src/fitting.jl b/src/fitting.jl
index 2c13609..8b1316e 100644
--- a/src/fitting.jl
+++ b/src/fitting.jl
@@ -6,7 +6,7 @@ const Model = Union{typeof(gaussian), typeof(normal), typeof(airydisk), typeof(m
 
 Fit a PSF model (`model`) defined by the given `params` as a named tuple of the parameters to fit and their default values. This model is fit to the data in `image` at the specified `inds` (by default, the entire array). To pass extra keyword arguments to the `model` (i.e., to "freeze" a parameter), pass them in a named tuple to `func_kwargs`. The default loss function is the chi-squared loss, which uses the the square of the difference (i.e., the L2 norm). You can change this to the L1 norm, for example, by passing `loss=abs`. The maximum FWHM can be set with `maxfwhm` as a number or tuple.
 
-Additional keyword arguments, as well as the fitting algorithm `alg`, are passed to `Optim.optimize`. By default we use forward-mode auto-differentiation (AD) to derive Jacobians for the LBFGS optimization algorithm. Refer to the [Optim.jl documentation](https://julianlsolvers.github.io/Optim.jl/stable/) for more information.
+Additional keyword arguments, as well as the fitting algorithm `alg`, are passed to `Optim.optimize`. By default we use forward-mode auto-differentiation (AD) to derive Jacobians for the [Newton with Trust Region](https://julianlsolvers.github.io/Optim.jl/stable/#algo/newton_trust_region/) optimization algorithm. Refer to the [Optim.jl documentation](https://julianlsolvers.github.io/Optim.jl/stable/) for more information.
 
 # Choosing parameters
 
@@ -70,37 +70,48 @@ function fit(model::Model,
              inds=axes(image);
              func_kwargs=(;),
              loss=abs2,
-             alg=LBFGS(),
+             alg=NewtonTrustRegion(),
              maxfwhm=Inf,
              kwargs...) where T
     _keys = keys(params)
+
+    _loss = build_loss_function(model, params, image, inds; func_kwargs, loss, maxfwhm)
+    X0 = vector_from_params(T, params)
+    result = optimize(_loss, X0, alg; autodiff=:forward, kwargs...)
+    Optim.converged(result) || @warn "optimizer did not converge" result
+    X = Optim.minimizer(result)
+    P_best = generate_params(_keys, X)
+    return P_best, model(T; P_best..., func_kwargs...)
+end
+
+function build_loss_function(model::Model, params, image, inds=axes(image); func_kwargs=(;), loss=abs2, maxfwhm=Inf)
+    _keys = keys(params)
     cartinds = CartesianIndices(inds)
     minind = map(minimum, inds)
     maxind = map(maximum, inds)
-    function _loss(X::AbstractVector{T}) where T
+    @inline function loss_function(X::AbstractVector{T}) where T
         P = generate_params(_keys, X)
+        # position is within stamp
         minind[1] - 0.5 ≤ P.x ≤ maxind[1] + 0.5 || return T(Inf)
         minind[2] - 0.5 ≤ P.y ≤ maxind[2] + 0.5 || return T(Inf)
+        # fwhm is non-negative and below max value
         all(0 .< P.fwhm .< maxfwhm) || return T(Inf)
+        # ratio is strictly (0, 1)
         if :ratio in _keys
             0 < P.ratio < 1 || return T(Inf)
         end
+        # avoid circular degeneracy with theta
         if :theta in _keys
             -45 < P.theta < 45 || return T(Inf)
         end
-        # mean of errors (by default, with L2 norm == chi-squared)
-        mse = mean(cartinds) do idx
+        # sum of errors (by default, with L2 norm == chi-squared)
+        chi2 = sum(cartinds) do idx
             resid = model(T, idx; P..., func_kwargs...) - image[idx]
             return loss(resid)
         end
-        return mse
+        return chi2
     end
-    X0 = vector_from_params(T, params)
-    result = optimize(_loss, X0, alg; autodiff=:forward, kwargs...)
-    Optim.converged(result) || @warn "optimizer did not converge" result
-    X = Optim.minimizer(result)
-    P_best = generate_params(_keys, X)
-    return P_best, model(T; P_best..., func_kwargs...)
+    return loss_function
 end
 
 function vector_from_params(T, params)