From 3d1dfe6355c50b4d21983b29fbe19a7eb99f5b4d Mon Sep 17 00:00:00 2001
From: Yutong Zhao <proteneer@gmail.com>
Date: Thu, 19 Sep 2024 15:22:26 -0400
Subject: [PATCH] Rebased

---
 examples/validate_solvent_perturbations.py    | 163 ++++++++
 tests/nonbonded/test_smoothcore_potential.py  | 225 ++++++++++
 timemachine/cpp/CMakeLists.txt                |   1 +
 timemachine/cpp/src/kernels/k_nonbonded.cuh   | 134 ++++++
 .../smoothcore_nonbonded_interaction_group.cu | 393 ++++++++++++++++++
 ...smoothcore_nonbonded_interaction_group.hpp | 103 +++++
 timemachine/cpp/src/wrap_kernels.cpp          |  93 +++++
 timemachine/fe/free_energy.py                 |   4 +
 timemachine/fe/rbfe.py                        |   3 +-
 timemachine/fe/single_topology.py             | 278 +++++++++----
 timemachine/fe/topology.py                    |  65 ++-
 timemachine/ff/handlers/nonbonded.py          |   1 +
 timemachine/lib/custom_ops.pyi                |   8 +
 timemachine/potentials/__init__.py            |   2 +
 timemachine/potentials/nonbonded.py           | 155 ++++++-
 timemachine/potentials/potentials.py          |  26 +-
 timemachine/testsystems/data/benzyl.sdf       | 274 ++++++++++++
 17 files changed, 1844 insertions(+), 84 deletions(-)
 create mode 100644 examples/validate_solvent_perturbations.py
 create mode 100644 tests/nonbonded/test_smoothcore_potential.py
 create mode 100644 timemachine/cpp/src/smoothcore_nonbonded_interaction_group.cu
 create mode 100644 timemachine/cpp/src/smoothcore_nonbonded_interaction_group.hpp
 create mode 100644 timemachine/testsystems/data/benzyl.sdf

diff --git a/examples/validate_solvent_perturbations.py b/examples/validate_solvent_perturbations.py
new file mode 100644
index 0000000000..8c38be38ec
--- /dev/null
+++ b/examples/validate_solvent_perturbations.py
@@ -0,0 +1,163 @@
+# usage:
+# python examples/validate_solvent_perturbations.py --ligands timemachine/testsystems/data/benzyl.sdf --mol_a_name benzene --mol_b_name benzene_bicycle --n_frames 200 --steps_per_frame 200 --n_eq_steps 5000 --seed 2024
+
+# 1) using smoothcores
+# 2) not-using smoothcores
+
+# bisection only, (main goal is to check # of windows emitted in the fast bisection stage)
+import argparse
+import copy
+import pickle
+
+import numpy as np
+
+from timemachine.constants import DEFAULT_ATOM_MAPPING_KWARGS
+from timemachine.fe import atom_mapping
+from timemachine.fe.free_energy import MDParams
+from timemachine.fe.rbfe import HostConfig, estimate_relative_free_energy_bisection
+from timemachine.fe.utils import plot_atom_mapping_grid, read_sdf
+from timemachine.ff import Forcefield
+from timemachine.md import builders
+
+
+def get_mol_by_name(mols, name):
+    for m in mols:
+        if m.GetProp("_Name") == name:
+            return m
+
+    assert 0, "Mol not found"
+
+
+def run_solvent(mol_a, mol_b, core, forcefield, md_params, use_smoothcore):
+    box_width = 4.0
+    solvent_sys, solvent_conf, solvent_box, solvent_top = builders.build_water_system(box_width, forcefield.water_ff)
+    solvent_box += np.diag([0.1, 0.1, 0.1])  # remove any possible clashes
+    solvent_host_config = HostConfig(solvent_sys, solvent_conf, solvent_box, len(solvent_conf))
+    solvent_res = estimate_relative_free_energy_bisection(
+        mol_a,
+        mol_b,
+        core,
+        forcefield,
+        solvent_host_config,
+        md_params=md_params,
+        prefix="solvent",
+        min_overlap=0.667,
+        use_smoothcore=use_smoothcore,
+    )
+
+    print("==statistics==")
+    print("lambda_schedule", [x.lamb for x in solvent_res.final_result.initial_states])
+    print("dGs", solvent_res.final_result.dGs)
+    print("num_windows", len(solvent_res.final_result.dGs) + 1)
+    print("sum(dGs)", np.sum(solvent_res.final_result.dGs))
+    print("norm(dG_errs)", np.linalg.norm(solvent_res.final_result.dG_errs))
+
+    mol_a_name = mol_a.GetProp("_Name")
+    mol_b_name = mol_b.GetProp("_Name")
+
+    with open(f"solvent_summary_overlap_{mol_a_name}_to_{mol_b_name}_{use_smoothcore}.png", "wb") as fh:
+        fh.write(solvent_res.plots.overlap_summary_png)
+
+    with open(f"solvent_detail_overlap_{mol_a_name}_to_{mol_b_name}_{use_smoothcore}.png", "wb") as fh:
+        fh.write(solvent_res.plots.overlap_detail_png)
+
+    with open(f"solvent_debug_{mol_a_name}_to_{mol_b_name}_{use_smoothcore}.pkl", "wb") as f:
+        pickle.dump(solvent_res, f)
+
+
+def read_from_args():
+    parser = argparse.ArgumentParser(description="Estimate relative solvation free energy between two ligands")
+    parser.add_argument(
+        "--n_frames", type=int, help="number of frames to use for the free energy estimate", required=True
+    )
+    parser.add_argument("--ligands", type=str, help="SDF file containing the ligands of interest", required=True)
+    parser.add_argument("--mol_a_name", type=str, help="name of the start molecule", required=True)
+    parser.add_argument("--mol_b_name", type=str, help="name of the end molecule", required=True)
+    parser.add_argument("--steps_per_frame", type=int, help="steps per frame", required=True)
+    parser.add_argument("--n_eq_steps", type=int, help="n eq steps", required=True)
+    parser.add_argument("--seed", type=int, help="Random number seed", required=True)
+
+    args = parser.parse_args()
+    mols = read_sdf(str(args.ligands))
+    mol_a = get_mol_by_name(mols, args.mol_a_name)
+    mol_b = get_mol_by_name(mols, args.mol_b_name)
+
+    md_params = MDParams(
+        n_frames=args.n_frames, n_eq_steps=args.n_eq_steps, steps_per_frame=args.steps_per_frame, seed=args.seed
+    )
+
+    ff = Forcefield.load_default()
+
+    atom_mapping_kwargs = copy.copy(DEFAULT_ATOM_MAPPING_KWARGS)
+    atom_mapping_kwargs["ring_matches_ring_only"] = False
+
+    all_cores = atom_mapping.get_cores(mol_a, mol_b, **atom_mapping_kwargs)
+
+    core = all_cores[0]
+
+    # Hydrogen Only
+    core = np.array([[8, 8]])
+
+    res = plot_atom_mapping_grid(mol_a, mol_b, core)
+    fpath = f"atom_mapping_{args.mol_a_name}_to_{args.mol_b_name}.svg"
+    print("core mapping written to", fpath)
+    with open(fpath, "w") as fh:
+        fh.write(res)
+
+    # debug
+    from timemachine.fe.single_topology import SingleTopology
+
+    st = SingleTopology(mol_a, mol_b, core, ff, use_smoothcore=True)
+    all_charges = []
+    all_eps = []
+    all_ws = []
+    all_mols = []
+    for lamb in np.linspace(0, 1, 10):
+        guest_ixn_env_params = st._get_smoothcore_guest_params(ff.q_handle, ff.lj_handle, lamb)
+        all_charges.append(guest_ixn_env_params[:, 0])
+        all_eps.append(guest_ixn_env_params[:, 2])
+        all_ws.append(guest_ixn_env_params[:, 3])
+        all_mols.append(st.mol(lamb))
+    all_charges = np.array(all_charges)
+    all_eps = np.array(all_eps)
+    all_ws = np.array(all_ws)
+    num_atoms = all_charges.shape[1]
+    dummy_a_atoms = st.dummy_a_idxs()
+    dummy_b_atoms = st.dummy_b_idxs()
+    import matplotlib.pyplot as plt
+
+    f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
+    for atom_idx in range(num_atoms):
+        symbol = all_mols[0].GetAtomWithIdx(atom_idx).GetSymbol()
+        if atom_idx in dummy_a_atoms:
+            linestyle = "dashed"
+        elif atom_idx in dummy_b_atoms:
+            linestyle = "dotted"
+        else:
+            linestyle = "solid"
+        ax1.plot(all_charges[:, atom_idx], label=f"{symbol}{atom_idx}", linestyle=linestyle)
+        ax2.plot(all_ws[:, atom_idx], label=f"{symbol}{atom_idx}", linestyle=linestyle)
+        ax3.plot(all_eps[:, atom_idx], label=f"{symbol}{atom_idx}", linestyle=linestyle)
+    ax1.set_ylabel("charge")
+    ax1.set_xlabel("lambda_idx")
+
+    ax2.set_ylabel("w")
+    ax2.set_xlabel("lambda_idx")
+
+    ax3.set_ylabel("eps")
+    ax3.set_xlabel("lambda_idx")
+
+    ax3.legend()
+    mol_a_name = mol_a.GetProp("_Name")
+    mol_b_name = mol_b.GetProp("_Name")
+    plt.tight_layout()
+    plt.savefig(f"solvent_parameter_interpolation_{mol_a_name}_to_{mol_b_name}.png")
+    # assert 0
+    # debug
+
+    run_solvent(mol_a, mol_b, core, ff, md_params, use_smoothcore=True)
+    run_solvent(mol_a, mol_b, core, ff, md_params, use_smoothcore=False)
+
+
+if __name__ == "__main__":
+    read_from_args()
diff --git a/tests/nonbonded/test_smoothcore_potential.py b/tests/nonbonded/test_smoothcore_potential.py
new file mode 100644
index 0000000000..7324140177
--- /dev/null
+++ b/tests/nonbonded/test_smoothcore_potential.py
@@ -0,0 +1,225 @@
+import numpy as np
+from common import GradientTest
+
+from timemachine.fe.free_energy import HostConfig
+from timemachine.fe.single_topology import AtomMapFlags, AtomMapMixin, SingleTopology
+from timemachine.fe.system import convert_bps_into_system
+from timemachine.fe.utils import get_romol_conf
+from timemachine.ff import Forcefield
+from timemachine.ff.handlers import openmm_deserializer
+from timemachine.md.builders import build_protein_system
+from timemachine.md.minimizer import pre_equilibrate_host
+from timemachine.potentials import SmoothcoreNonbondedInteractionGroup
+from timemachine.potentials.nonbonded import (
+    MIN_SMOOTHCORE_EPS_SQRT,
+    MIN_SMOOTHCORE_SIG_HALF,
+    reference_nonbonded_interaction_group,
+    smoothcore_charge_interpolation,
+    smoothcore_lj_interpolation,
+)
+from timemachine.testsystems.relative import get_hif2a_ligand_pair_single_topology
+
+
+def get_hif2a_protein_coords_box_params(mol_a, mol_b, ff, minimize=False):
+    solvated_host_system, solvated_host_coords, box, topology, num_water_atoms = build_protein_system(
+        "timemachine/testsystems/data/hif2a_nowater_min.pdb", ff.protein_ff, ff.water_ff, [mol_a, mol_b]
+    )
+
+    host_bps, _ = openmm_deserializer.deserialize_system(solvated_host_system, cutoff=1.2)
+
+    host_system = convert_bps_into_system(host_bps)
+
+    if minimize:
+        hc = HostConfig(solvated_host_system, solvated_host_coords, box, num_water_atoms)
+        equil_coords, equil_box = pre_equilibrate_host([mol_a, mol_b], hc, ff)
+        assert equil_coords.shape == solvated_host_coords.shape
+        return equil_coords, equil_box, host_bps[-1].params, host_system, num_water_atoms
+    else:
+        return solvated_host_coords, box, host_bps[-1].params, host_system, num_water_atoms
+
+
+def test_smoothcore_charge_interpolation():
+    """
+    Test that:
+    1) charge interpolation along the schedule maintains net-neutral charges
+    2) end-states are consistent with the interpolate lambda values of 0 and 1
+    """
+    mol_a, mol_b, core = get_hif2a_ligand_pair_single_topology()
+
+    ff = Forcefield.load_default()
+
+    mol_a_charges = ff.q_handle.parameterize(mol_a)
+    mol_b_charges = ff.q_handle.parameterize(mol_b)
+    amm = AtomMapMixin(mol_a, mol_b, core)
+
+    charges_src = np.zeros(amm.get_num_atoms())
+    charges_dst = np.zeros(amm.get_num_atoms())
+
+    mol_a_idxs = amm.mol_a_idxs()
+    mol_b_idxs = amm.mol_b_idxs()
+
+    charges_src[mol_a_idxs] = mol_a_charges[np.array([amm.c_to_a[x] for x in mol_a_idxs])]
+    charges_dst[mol_b_idxs] = mol_b_charges[np.array([amm.c_to_b[x] for x in mol_b_idxs])]
+
+    # test that net charges are zero through-out the schedule.
+    for lamb in np.linspace(0, 1.0, 20):
+        charges = smoothcore_charge_interpolation(lamb, charges_src, charges_dst, mol_a_idxs, mol_b_idxs)
+        np.testing.assert_almost_equal(np.sum(charges), 0, decimal=5)
+
+    charges_lhs = smoothcore_charge_interpolation(0.0, charges_src, charges_dst, mol_a_idxs, mol_b_idxs)
+    charges_rhs = smoothcore_charge_interpolation(1.0, charges_src, charges_dst, mol_a_idxs, mol_b_idxs)
+
+    np.testing.assert_array_equal(charges_lhs, charges_src)
+    np.testing.assert_array_equal(charges_rhs, charges_dst)
+
+
+def test_smoothcore_lj_interpolation():
+    """
+    Test that the lj interpolation has the correct truncation treatment (upon absorption into
+    the anchoring atoms).
+    """
+    mol_a, mol_b, core = get_hif2a_ligand_pair_single_topology()
+
+    ff = Forcefield.load_default()
+
+    mol_a_lj = ff.lj_handle.parameterize(mol_a)
+    mol_b_lj = ff.lj_handle.parameterize(mol_b)
+    amm = AtomMapMixin(mol_a, mol_b, core)
+
+    lj_src = np.zeros((amm.get_num_atoms(), 2))
+    lj_dst = np.zeros((amm.get_num_atoms(), 2))
+
+    mol_a_idxs = amm.mol_a_idxs()
+    mol_b_idxs = amm.mol_b_idxs()
+
+    lj_src[mol_a_idxs] = mol_a_lj[np.array([amm.c_to_a[x] for x in mol_a_idxs])]
+    lj_dst[mol_b_idxs] = mol_b_lj[np.array([amm.c_to_b[x] for x in mol_b_idxs])]
+
+    lj_lhs = smoothcore_lj_interpolation(0.0, lj_src, lj_dst, mol_a_idxs, mol_b_idxs)
+    lj_rhs = smoothcore_lj_interpolation(1.0, lj_src, lj_dst, mol_a_idxs, mol_b_idxs)
+
+    np.testing.assert_array_equal(lj_lhs, lj_src)
+    np.testing.assert_array_equal(lj_rhs, lj_dst)
+    assert np.all(lj_lhs[amm.dummy_b_idxs(), 0] == 0)
+    assert np.all(lj_lhs[amm.dummy_b_idxs(), 1] == 0)
+    assert np.all(lj_rhs[amm.dummy_a_idxs(), 0] == 0)
+    assert np.all(lj_rhs[amm.dummy_a_idxs(), 1] == 0)
+
+    lj_almost_lhs = smoothcore_lj_interpolation(0.0001, lj_src, lj_dst, mol_a_idxs, mol_b_idxs)
+    assert np.all(lj_almost_lhs[amm.dummy_b_idxs(), 0] >= MIN_SMOOTHCORE_SIG_HALF)
+    assert np.all(lj_almost_lhs[amm.dummy_b_idxs(), 1] >= MIN_SMOOTHCORE_EPS_SQRT)
+
+    lj_almost_rhs = smoothcore_lj_interpolation(0.9999, lj_src, lj_dst, mol_a_idxs, mol_b_idxs)
+    assert np.all(lj_almost_rhs[amm.dummy_a_idxs(), 0] >= MIN_SMOOTHCORE_SIG_HALF)
+    assert np.all(lj_almost_rhs[amm.dummy_a_idxs(), 1] >= MIN_SMOOTHCORE_EPS_SQRT)
+
+
+def test_smoothcore_ligand_anchor_idxs():
+    mol_a, mol_b, core = get_hif2a_ligand_pair_single_topology()
+    ff = Forcefield.load_default()
+    st = SingleTopology(mol_a, mol_b, core, ff)
+
+    ligand_anchor_idxs = np.zeros(st.get_num_atoms(), dtype=np.int32)
+    for c_idx, c_flag in enumerate(st.c_flags):
+        if c_flag == AtomMapFlags.CORE:
+            ligand_anchor_idxs[c_idx] = c_idx
+
+    st = SingleTopology(mol_a, mol_b, core, ff)
+
+    # anchors implied by the atom-mapping, manually generated by means of eye-balling
+    ligand_anchor_idxs[st.a_to_c[22]] = st.a_to_c[0]
+    ligand_anchor_idxs[st.a_to_c[24]] = st.a_to_c[0]
+    ligand_anchor_idxs[st.a_to_c[25]] = st.a_to_c[0]
+    ligand_anchor_idxs[st.a_to_c[33]] = st.a_to_c[0]
+    ligand_anchor_idxs[st.b_to_c[23]] = st.b_to_c[22]
+
+    test_ligand_anchor_idxs = st.get_smoothcore_ligand_anchor_idxs()
+
+    np.testing.assert_array_equal(test_ligand_anchor_idxs, ligand_anchor_idxs)
+
+
+def test_smoothcore_potential():
+    """
+    Test that the smooth core potential on the minimized hif2a system.
+
+    Verifies:
+
+    1) Agreement with reference jax implementation
+    2) Agreement with existing NonbondedInteractionGroup at the end-states
+
+    """
+
+    mol_a, mol_b, core = get_hif2a_ligand_pair_single_topology()
+    x_a = get_romol_conf(mol_a)
+    x_b = get_romol_conf(mol_b)
+
+    ff = Forcefield.load_default()
+    host_conf, box, host_params, host_system, num_water_atoms = get_hif2a_protein_coords_box_params(
+        mol_a, mol_b, ff, minimize=True
+    )
+
+    ref_beta = host_system.nonbonded.potential.beta
+    ref_cutoff = host_system.nonbonded.potential.cutoff
+
+    num_host_atoms = len(host_conf)
+
+    st = SingleTopology(mol_a, mol_b, core, ff, use_smoothcore=True)
+    row_atom_idxs = np.arange(st.get_num_atoms()) + num_host_atoms
+    col_atom_idxs = np.arange(num_host_atoms)
+
+    ligand_anchor_idxs = st.get_smoothcore_ligand_anchor_idxs()
+    ligand_anchor_idxs += num_host_atoms
+    anchor_idxs = np.concatenate([np.arange(num_host_atoms), ligand_anchor_idxs])
+
+    xs = []
+    test_nrgs = []
+    test_params = []
+
+    total_num_atoms = num_host_atoms + st.get_num_atoms()
+    ref_pot = SmoothcoreNonbondedInteractionGroup(
+        total_num_atoms,
+        row_atom_idxs.astype(np.int32),
+        anchor_idxs.astype(np.int32),
+        ref_beta,
+        ref_cutoff,
+        col_atom_idxs.astype(np.int32),
+    )
+
+    for lamb in np.linspace(0, 1, 11):
+        ligand_params = st._get_smoothcore_guest_params(ff.q_handle, ff.lj_handle, lamb)
+        combined_params = np.concatenate([host_params, ligand_params])
+        ligand_conf = st.combine_confs(x_a, x_b, lamb)
+        combined_conf = np.concatenate([host_conf, ligand_conf])
+        test_params.append(combined_params)
+        ref_nrg = ref_pot(combined_conf, combined_params, box)
+        xs.append(combined_conf)
+        test_nrgs.append(ref_nrg)
+
+        for precision, rtol, atol in [(np.float64, 1e-8, 1e-8), (np.float32, 5e-4, 5e-4)]:
+            test_potential, combined_params = st._parameterize_unbound_host_guest_nonbonded_ixn(
+                lamb,
+                host_system.nonbonded,
+                num_water_atoms,
+            )
+            test_impl = test_potential.to_gpu(precision)
+            GradientTest().compare_forces(combined_conf, combined_params, box, ref_pot, test_impl, rtol=rtol, atol=atol)
+
+    # test that reference gpu, reference cpu, and the smoothcore are consistent at the end-states
+    st = SingleTopology(mol_a, mol_b, core, ff, use_smoothcore=False)
+    hg_system_lhs = st.combine_with_host(host_system, 0.0, num_water_atoms)
+    ref_u_lhs_gpu = hg_system_lhs.nonbonded_host_guest_ixn(xs[0], box)
+    ref_params_lhs = hg_system_lhs.nonbonded_host_guest_ixn.params.reshape(-1, 4)
+    ref_u_lhs_cpu = reference_nonbonded_interaction_group(
+        xs[0], ref_params_lhs, box, row_atom_idxs, col_atom_idxs, beta=ref_beta, cutoff=ref_cutoff
+    )
+    np.testing.assert_allclose(test_nrgs[0], ref_u_lhs_cpu)
+    np.testing.assert_allclose(ref_u_lhs_gpu, ref_u_lhs_cpu)
+
+    hg_system_rhs = st.combine_with_host(host_system, 1.0, num_water_atoms)
+    ref_u_rhs_gpu = hg_system_rhs.nonbonded_host_guest_ixn(xs[-1], box)
+    ref_params_rhs = hg_system_rhs.nonbonded_host_guest_ixn.params.reshape(-1, 4)
+    ref_u_rhs_cpu = reference_nonbonded_interaction_group(
+        xs[-1], ref_params_rhs, box, row_atom_idxs, col_atom_idxs, beta=ref_beta, cutoff=ref_cutoff
+    )
+    np.testing.assert_allclose(test_nrgs[-1], ref_u_rhs_cpu)
+    np.testing.assert_allclose(ref_u_rhs_gpu, ref_u_rhs_cpu)
diff --git a/timemachine/cpp/CMakeLists.txt b/timemachine/cpp/CMakeLists.txt
index 7a7d4145af..3219528a04 100644
--- a/timemachine/cpp/CMakeLists.txt
+++ b/timemachine/cpp/CMakeLists.txt
@@ -73,6 +73,7 @@ pybind11_add_module(${LIBRARY_NAME} SHARED NO_EXTRAS
   src/nonbonded_all_pairs.cu
   src/nonbonded_pair_list.cu
   src/nonbonded_interaction_group.cu
+  src/smoothcore_nonbonded_interaction_group.cu
   src/energy_accumulation.cu
   src/local_md_utils.cu
   src/neighborlist.cu
diff --git a/timemachine/cpp/src/kernels/k_nonbonded.cuh b/timemachine/cpp/src/kernels/k_nonbonded.cuh
index 5fe32a9a88..986e93dc17 100644
--- a/timemachine/cpp/src/kernels/k_nonbonded.cuh
+++ b/timemachine/cpp/src/kernels/k_nonbonded.cuh
@@ -54,6 +54,103 @@ void __global__ k_check_rebuild_coords_and_box_gather(
     }
 }
 
+template <typename RealType, int COORDS_DIM, int PARAMS_DIM>
+void __global__ k_smoothcore_rescale_coords(
+    const int N,
+    const unsigned int *__restrict__ row_atom_idxs,
+    const unsigned int *__restrict__ row_anchor_idxs,
+    const double *__restrict__ coords,
+    const double *__restrict__ params,
+    double *__restrict__ coords_out) {
+    static_assert(COORDS_DIM == 3);
+    static_assert(PARAMS_DIM == 4);
+    int atom_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (atom_idx >= N) {
+        return;
+    }
+
+    unsigned int anchor_idx = row_anchor_idxs[atom_idx];
+    RealType rescale =
+        static_cast<RealType>(1.0) - static_cast<RealType>(params[atom_idx * PARAMS_DIM + 3]); // old w_coordinates
+
+    // Coords have 3 dimensions, params have 4
+#pragma unroll COORDS_DIM
+    for (int i = 0; i < COORDS_DIM; i++) {
+        RealType atom_pos = coords[atom_idx * COORDS_DIM + i];
+        RealType anchor_pos = coords[anchor_idx * COORDS_DIM + i];
+        coords_out[atom_idx * COORDS_DIM + i] = anchor_pos + rescale * (atom_pos - anchor_pos);
+    }
+}
+
+template <typename RealType, int COORDS_DIM, int PARAMS_DIM>
+void __global__ k_smoothcore_rescale_and_accum_du_dp(
+    const int N,
+    const unsigned int *__restrict__ anchor_idxs,
+    const double *__restrict__ coords,
+    const double *__restrict__ params,
+    const unsigned int *__restrict__ perm,
+    const unsigned long long *__restrict__ sorted_du_dp,
+    const unsigned long long *__restrict__ sorted_du_dx,
+    unsigned long long *__restrict__ du_dp) {
+    static_assert(COORDS_DIM == 3);
+    static_assert(PARAMS_DIM == 4);
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) {
+        return;
+    }
+
+    int atom_idx = perm[tid];
+    unsigned int anchor_idx = anchor_idxs[atom_idx];
+    RealType accum_w = 0;
+#pragma unroll COORDS_DIM
+    for (int i = 0; i < COORDS_DIM; i++) {
+        RealType atom_pos = coords[atom_idx * COORDS_DIM + i];
+        RealType anchor_pos = coords[anchor_idx * COORDS_DIM + i];
+        RealType d_du_dx_real = FIXED_TO_FLOAT<RealType>(sorted_du_dx[tid * COORDS_DIM + i]);
+        accum_w += -(atom_pos - anchor_pos) * d_du_dx_real;
+    }
+
+    atomicAdd(du_dp + atom_idx * PARAMS_DIM + 0, sorted_du_dp[tid * PARAMS_DIM + 0]);
+    atomicAdd(du_dp + atom_idx * PARAMS_DIM + 1, sorted_du_dp[tid * PARAMS_DIM + 1]);
+    atomicAdd(du_dp + atom_idx * PARAMS_DIM + 2, sorted_du_dp[tid * PARAMS_DIM + 2]);
+    atomicAdd(du_dp + atom_idx * PARAMS_DIM + 3, FLOAT_TO_FIXED_DU_DP<double, FIXED_EXPONENT_DU_DW>(accum_w));
+}
+
+template <typename RealType, int COORDS_DIM, int PARAMS_DIM>
+void __global__ k_smoothcore_rescale_and_accum_du_dx(
+    const int N,
+    const unsigned int *__restrict__ anchor_idxs,
+    const double *__restrict__ coords,
+    const double *__restrict__ params,
+    const unsigned int *__restrict__ perm,
+    const unsigned long long *__restrict__ sorted_du_dx,
+    unsigned long long *__restrict__ du_dx_out) {
+
+    static_assert(COORDS_DIM == 3);
+    static_assert(PARAMS_DIM == 4);
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= N) {
+        return;
+    }
+
+    int atom_idx = perm[tid];
+    unsigned int anchor_idx = anchor_idxs[atom_idx];
+#pragma unroll COORDS_DIM
+    for (int i = 0; i < COORDS_DIM; i++) {
+        RealType atom_pos = coords[atom_idx * COORDS_DIM + i];
+        RealType anchor_pos = coords[anchor_idx * COORDS_DIM + i];
+        RealType du_dx_hat = FIXED_TO_FLOAT<RealType>(sorted_du_dx[tid * COORDS_DIM + i]);
+        RealType w_i = params[atom_idx * PARAMS_DIM + 3];
+        RealType dxhat_danchor = w_i;
+        RealType dxhat_datom = 1 - w_i;
+        RealType du_danchor = du_dx_hat * dxhat_danchor;
+        RealType du_datom = du_dx_hat * dxhat_datom;
+
+        atomicAdd(du_dx_out + anchor_idx * COORDS_DIM + i, FLOAT_TO_FIXED_NONBONDED(du_danchor));
+        atomicAdd(du_dx_out + atom_idx * COORDS_DIM + i, FLOAT_TO_FIXED_NONBONDED(du_datom));
+    }
+}
+
 template <typename RealType, int COORDS_DIM, int PARAMS_DIM>
 void __global__ k_gather_coords_and_params(
     const int N,
@@ -82,6 +179,38 @@ void __global__ k_gather_coords_and_params(
     }
 }
 
+template <typename RealType, int COORDS_DIM, int PARAMS_DIM>
+void __global__ k_smoothcore_gather_coords_and_params_zero_w(
+    const int N,
+    const unsigned int *__restrict__ idxs,
+    const RealType *__restrict__ coords,
+    const RealType *__restrict__ params,
+    RealType *__restrict__ gathered_coords,
+    RealType *__restrict__ gathered_params) {
+    static_assert(COORDS_DIM == 3);
+    static_assert(PARAMS_DIM == PARAMS_PER_ATOM);
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= N) {
+        return;
+    }
+
+    const unsigned int atom_idx = idxs[idx];
+
+    // Coords have 3 dimensions, params have 4
+#pragma unroll COORDS_DIM
+    for (int i = 0; i < COORDS_DIM; i++) {
+        gathered_coords[idx * COORDS_DIM + i] = coords[atom_idx * COORDS_DIM + i];
+    }
+#pragma unroll PARAMS_DIM
+    for (int i = 0; i < PARAMS_DIM; i++) {
+        if (i == PARAMS_DIM - 1) {
+            gathered_params[idx * PARAMS_DIM + 3] = 0; // zero out w coords
+        } else {
+            gathered_params[idx * PARAMS_DIM + i] = params[atom_idx * PARAMS_DIM + i];
+        }
+    }
+}
+
 template <typename T, int D>
 void __global__ k_scatter_accum(
     const int N,
@@ -201,6 +330,7 @@ void __device__ v_nonbonded_unified(
         RealType d2ij = delta_x * delta_x + delta_y * delta_y + delta_z * delta_z;
         RealType delta_w;
 
+        // w coords seem messed up here  - i.e. they're being double computed somehow
         if (ALCHEMICAL) {
             // (ytz): we are guaranteed that delta_w is zero if ALCHEMICAL == false
             delta_w = w_i - w_j;
@@ -322,6 +452,10 @@ void __device__ v_nonbonded_unified(
             atomicAdd(du_dp + lj_param_idx_eps_j, g_epsj);
             atomicAdd(du_dp + w_param_idx_j, g_wj);
         }
+
+        // if(atom_i_idx == 168 || atom_j_idx == 168) {
+        //     printf("inner debug %d %d | %f %f %f %f\n", atom_i_idx, atom_j_idx, FIXED_TO_FLOAT_DU_DP<RealType, FIXED_EXPONENT_DU_DCHARGE>(g_qi), FIXED_TO_FLOAT_DU_DP<RealType, FIXED_EXPONENT_DU_DCHARGE>(g_qj), FIXED_TO_FLOAT_DU_DP<RealType, FIXED_EXPONENT_DU_DEPS>(g_epsi), FIXED_TO_FLOAT_DU_DP<RealType, FIXED_EXPONENT_DU_DEPS>(g_epsj));
+        // }
     }
 }
 
diff --git a/timemachine/cpp/src/smoothcore_nonbonded_interaction_group.cu b/timemachine/cpp/src/smoothcore_nonbonded_interaction_group.cu
new file mode 100644
index 0000000000..5c6290c764
--- /dev/null
+++ b/timemachine/cpp/src/smoothcore_nonbonded_interaction_group.cu
@@ -0,0 +1,393 @@
+#include <complex>
+#include <string>
+#include <vector>
+
+#include "device_buffer.hpp"
+#include "energy_accumulation.hpp"
+#include "fixed_point.hpp"
+#include "gpu_utils.cuh"
+#include "kernel_utils.cuh"
+#include "kernels/k_indices.cuh"
+#include "nonbonded_common.hpp"
+#include "set_utils.hpp"
+#include "smoothcore_nonbonded_interaction_group.hpp"
+
+#include "k_nonbonded.cuh"
+
+static const int STEPS_PER_SORT = 200;
+
+namespace timemachine {
+
+template <typename RealType>
+SmoothcoreNonbondedInteractionGroup<RealType>::SmoothcoreNonbondedInteractionGroup(
+    const int N,
+    const std::vector<int> &row_atom_idxs,
+    const std::vector<int> &col_atom_idxs,
+    const std::vector<int> &anchor_atom_idxs,
+    const double beta,
+    const double cutoff,
+    const bool disable_hilbert_sort,
+    const double nblist_padding)
+    : N_(N), NR_(row_atom_idxs.size()), NC_(col_atom_idxs.size()),
+
+      kernel_ptrs_({// enumerate over every possible kernel combination
+                    // Set threads to 1 if not computing energy to reduced unused shared memory
+                    // U: Compute U
+                    // X: Compute DU_DX
+                    // P: Compute DU_DP
+                    //                                                                 U  X  P
+                    &k_nonbonded_unified<RealType, NONBONDED_KERNEL_THREADS_PER_BLOCK, 0, 0, 0>,
+                    &k_nonbonded_unified<RealType, NONBONDED_KERNEL_THREADS_PER_BLOCK, 0, 0, 1>,
+                    &k_nonbonded_unified<RealType, NONBONDED_KERNEL_THREADS_PER_BLOCK, 0, 1, 0>,
+                    &k_nonbonded_unified<RealType, NONBONDED_KERNEL_THREADS_PER_BLOCK, 0, 1, 1>,
+                    &k_nonbonded_unified<RealType, NONBONDED_KERNEL_THREADS_PER_BLOCK, 1, 0, 0>,
+                    &k_nonbonded_unified<RealType, NONBONDED_KERNEL_THREADS_PER_BLOCK, 1, 0, 1>,
+                    &k_nonbonded_unified<RealType, NONBONDED_KERNEL_THREADS_PER_BLOCK, 1, 1, 0>,
+                    &k_nonbonded_unified<RealType, NONBONDED_KERNEL_THREADS_PER_BLOCK, 1, 1, 1>}),
+
+      beta_(beta), cutoff_(cutoff), steps_since_last_sort_(0), nblist_(N_), nblist_padding_(nblist_padding),
+      hilbert_sort_(nullptr), disable_hilbert_(disable_hilbert_sort) {
+
+    this->validate_idxs(N_, row_atom_idxs, col_atom_idxs, false);
+
+    cudaSafeMalloc(&d_col_atom_idxs_, N_ * sizeof(*d_col_atom_idxs_));
+    cudaSafeMalloc(&d_row_atom_idxs_, N_ * sizeof(*d_row_atom_idxs_));
+    cudaSafeMalloc(&d_anchor_idxs_, N_ * sizeof(*d_anchor_idxs_));
+    cudaSafeMalloc(&d_u_buffer_, NONBONDED_KERNEL_BLOCKS * sizeof(*d_u_buffer_));
+
+    cudaSafeMalloc(&d_perm_, N_ * sizeof(*d_perm_));
+    cudaSafeMalloc(&d_rescaled_x_, N_ * 3 * sizeof(*d_rescaled_x_));
+    cudaSafeMalloc(&d_sorted_x_, N_ * 3 * sizeof(*d_sorted_x_));
+
+    cudaSafeMalloc(&d_sorted_p_, N_ * PARAMS_PER_ATOM * sizeof(*d_sorted_p_));
+    cudaSafeMalloc(&d_sorted_du_dx_, N_ * 3 * sizeof(*d_sorted_du_dx_));
+    cudaSafeMalloc(&d_sorted_du_dp_, N_ * PARAMS_PER_ATOM * sizeof(*d_sorted_du_dp_));
+
+    cudaSafeMalloc(&d_nblist_x_, N_ * 3 * sizeof(*d_nblist_x_));
+    gpuErrchk(cudaMemset(d_nblist_x_, 0, N_ * 3 * sizeof(*d_nblist_x_))); // set non-sensical positions
+    cudaSafeMalloc(&d_nblist_box_, 3 * 3 * sizeof(*d_nblist_box_));
+    gpuErrchk(cudaMemset(d_nblist_box_, 0, 3 * 3 * sizeof(*d_nblist_box_)));
+    cudaSafeMalloc(&d_rebuild_nblist_, 1 * sizeof(*d_rebuild_nblist_));
+    gpuErrchk(cudaMallocHost(&p_rebuild_nblist_, 1 * sizeof(*p_rebuild_nblist_)));
+
+    if (!disable_hilbert_) {
+        this->hilbert_sort_.reset(new HilbertSort(N_));
+    }
+
+    this->set_atom_idxs(row_atom_idxs, col_atom_idxs);
+
+    gpuErrchk(cudaMemcpy(d_anchor_idxs_, &anchor_atom_idxs[0], N_ * sizeof(*d_anchor_idxs_), cudaMemcpyHostToDevice));
+
+    // Create event with timings disabled as timings slow down events
+    gpuErrchk(cudaEventCreateWithFlags(&nblist_flag_sync_event_, cudaEventDisableTiming));
+};
+
+template <typename RealType> SmoothcoreNonbondedInteractionGroup<RealType>::~SmoothcoreNonbondedInteractionGroup() {
+    gpuErrchk(cudaFree(d_col_atom_idxs_));
+    gpuErrchk(cudaFree(d_row_atom_idxs_));
+    gpuErrchk(cudaFree(d_anchor_idxs_));
+
+    gpuErrchk(cudaFree(d_perm_));
+    gpuErrchk(cudaFree(d_rescaled_x_));
+    gpuErrchk(cudaFree(d_sorted_x_));
+    gpuErrchk(cudaFree(d_u_buffer_));
+
+    gpuErrchk(cudaFree(d_sorted_p_));
+    gpuErrchk(cudaFree(d_sorted_du_dx_));
+    gpuErrchk(cudaFree(d_sorted_du_dp_));
+
+    gpuErrchk(cudaFree(d_nblist_x_));
+    gpuErrchk(cudaFree(d_nblist_box_));
+    gpuErrchk(cudaFree(d_rebuild_nblist_));
+    gpuErrchk(cudaFreeHost(p_rebuild_nblist_));
+
+    gpuErrchk(cudaEventDestroy(nblist_flag_sync_event_));
+};
+
+template <typename RealType> bool SmoothcoreNonbondedInteractionGroup<RealType>::needs_sort() {
+    return steps_since_last_sort_ % STEPS_PER_SORT == 0;
+}
+
+template <typename RealType>
+void SmoothcoreNonbondedInteractionGroup<RealType>::sort(
+    const double *d_coords, const double *d_box, cudaStream_t stream) {
+    // We must rebuild the neighborlist after sorting, as the neighborlist is tied to a particular sort order
+    if (!disable_hilbert_) {
+        this->hilbert_sort_->sort_device(NR_, d_row_atom_idxs_, d_coords, d_box, d_perm_, stream);
+        this->hilbert_sort_->sort_device(NC_, d_col_atom_idxs_, d_coords, d_box, d_perm_ + NR_, stream);
+    } else {
+        gpuErrchk(cudaMemcpyAsync(
+            d_perm_, d_row_atom_idxs_, NR_ * sizeof(*d_row_atom_idxs_), cudaMemcpyDeviceToDevice, stream));
+        gpuErrchk(cudaMemcpyAsync(
+            d_perm_ + NR_, d_col_atom_idxs_, NC_ * sizeof(*d_col_atom_idxs_), cudaMemcpyDeviceToDevice, stream));
+    }
+    gpuErrchk(cudaMemsetAsync(d_rebuild_nblist_, 1, sizeof(*d_rebuild_nblist_), stream));
+    // Set the pinned memory to indicate that we need to rebuild
+    p_rebuild_nblist_[0] = 1;
+}
+
+template <typename RealType>
+void SmoothcoreNonbondedInteractionGroup<RealType>::execute_device(
+    const int N,
+    const int P,
+    const double *d_x,
+    const double *d_p,   // N * PARAMS_PER_ATOM
+    const double *d_box, // 3 * 3
+    unsigned long long *d_du_dx,
+    unsigned long long *d_du_dp,
+    __int128 *d_u,
+    cudaStream_t stream) {
+
+    // (ytz) the nonbonded algorithm proceeds as follows:
+
+    // (done in constructor), construct a hilbert curve mapping each of the HILBERT_GRID_DIM x HILBERT_GRID_DIM x HILBERT_GRID_DIM cells into an index.
+    // a. decide if we need to rebuild the neighborlist, if so:
+    //     - look up which cell each particle belongs to, and its linear index along the hilbert curve.
+    //     - use radix pair sort keyed on the hilbert index with values equal to the atomic index
+    //     - resulting sorted values is the permutation array.
+    //     - permute coords
+    // b. else:
+    //     - permute new coords
+    // c. permute parameters
+    // d. compute the nonbonded interactions using the neighborlist
+    // e. inverse permute the forces, du/dps into the original index.
+    // f. u is buffered into a per-particle array, and then reduced.
+
+    if (N != N_) {
+        throw std::runtime_error(
+            "SmoothcoreNonbondedInteractionGroup::execute_device(): expected N == N_, got N=" + std::to_string(N) +
+            ", N_=" + std::to_string(N_));
+    }
+
+    if (P != N_ * PARAMS_PER_ATOM) {
+        throw std::runtime_error(
+            "SmoothcoreNonbondedInteractionGroup::execute_device(): expected P == N_*" +
+            std::to_string(PARAMS_PER_ATOM) + ", got P=" + std::to_string(P) + ", N_*" +
+            std::to_string(PARAMS_PER_ATOM) + "=" + std::to_string(N_ * PARAMS_PER_ATOM));
+    }
+
+    // If the size of the row or cols is none, exit
+    if (NR_ == 0 || NC_ == 0) {
+        return;
+    }
+
+    // initialize rescaled coordinates
+    gpuErrchk(cudaMemcpyAsync(d_rescaled_x_, d_x, N_ * 3 * sizeof(*d_rescaled_x_), cudaMemcpyDeviceToDevice, stream));
+
+    const int tpb = DEFAULT_THREADS_PER_BLOCK;
+    const int B = ceil_divide(N_, tpb);
+
+    k_smoothcore_rescale_coords<RealType, 3, PARAMS_PER_ATOM>
+        <<<B, tpb, 0, stream>>>(N_, nblist_.get_row_idxs(), d_anchor_idxs_, d_x, d_p, d_rescaled_x_);
+    gpuErrchk(cudaPeekAtLastError());
+
+    const int K = NR_ + NC_; // total number of interactions
+    const int B_K = ceil_divide(K, tpb);
+
+    if (this->needs_sort()) {
+        // Sorting always triggers a neighborlist rebuild
+        this->sort(d_rescaled_x_, d_box, stream);
+    } else {
+        // (ytz) see if we need to rebuild the neighborlist.
+        // Reuse the d_perm_ here to avoid having to make two kernels calls.
+        k_check_rebuild_coords_and_box_gather<RealType><<<B_K, tpb, 0, stream>>>(
+            NR_ + NC_, d_perm_, d_rescaled_x_, d_nblist_x_, d_box, d_nblist_box_, nblist_padding_, d_rebuild_nblist_);
+        gpuErrchk(cudaPeekAtLastError());
+        // we can optimize this away by doing the check on the GPU directly.
+        gpuErrchk(cudaMemcpyAsync(
+            p_rebuild_nblist_, d_rebuild_nblist_, 1 * sizeof(*p_rebuild_nblist_), cudaMemcpyDeviceToHost, stream));
+        gpuErrchk(cudaEventRecord(nblist_flag_sync_event_, stream));
+    }
+
+    // compute new coordinates/params
+    k_smoothcore_gather_coords_and_params_zero_w<double, 3, PARAMS_PER_ATOM>
+        <<<ceil_divide(K, tpb), tpb, 0, stream>>>(K, d_perm_, d_rescaled_x_, d_p, d_sorted_x_, d_sorted_p_);
+    gpuErrchk(cudaPeekAtLastError());
+    // reset buffers and sorted accumulators
+    if (d_du_dx || d_du_dp) {
+        gpuErrchk(cudaMemsetAsync(d_sorted_du_dx_, 0, K * 3 * sizeof(*d_sorted_du_dx_), stream));
+    }
+    if (d_du_dp) {
+        gpuErrchk(cudaMemsetAsync(d_sorted_du_dp_, 0, K * PARAMS_PER_ATOM * sizeof(*d_sorted_du_dp_), stream));
+    }
+
+    // Syncing to an event allows having additional kernels run while we synchronize
+    // Note that if no event is recorded, this is effectively a no-op, such as in the case of sorting.
+    gpuErrchk(cudaEventSynchronize(nblist_flag_sync_event_));
+    if (p_rebuild_nblist_[0] > 0) {
+        nblist_.build_nblist_device(K, d_sorted_x_, d_box, cutoff_ + nblist_padding_, stream);
+        gpuErrchk(cudaMemsetAsync(d_rebuild_nblist_, 0, sizeof(*d_rebuild_nblist_), stream));
+        gpuErrchk(cudaMemcpyAsync(
+            d_nblist_x_, d_rescaled_x_, N * 3 * sizeof(*d_rescaled_x_), cudaMemcpyDeviceToDevice, stream));
+        gpuErrchk(cudaMemcpyAsync(d_nblist_box_, d_box, 3 * 3 * sizeof(*d_box), cudaMemcpyDeviceToDevice, stream));
+    }
+
+    // look up which kernel we need for this computation
+    int kernel_idx = 0;
+    kernel_idx |= d_du_dp ? 1 << 0 : 0;
+    kernel_idx |= (d_du_dx || d_du_dp) ? 1 << 1 : 0; // we need forces if we request d_du_dp
+    kernel_idx |= d_u ? 1 << 2 : 0;
+
+    // note that these kernels will no longer trigger the "ALCHEMICAL=1" template option since we've set
+    // the w-params to be strictly 0.
+    kernel_ptrs_[kernel_idx]<<<NONBONDED_KERNEL_BLOCKS, NONBONDED_KERNEL_THREADS_PER_BLOCK, 0, stream>>>(
+        K,
+        nblist_.get_num_row_idxs(),
+        nblist_.get_ixn_count(),
+        d_sorted_x_,
+        d_sorted_p_,
+        d_box,
+        beta_,
+        cutoff_,
+        nblist_.get_row_idxs(),
+        nblist_.get_ixn_tiles(),
+        nblist_.get_ixn_atoms(),
+        d_sorted_du_dx_,
+        d_sorted_du_dp_,
+        d_u == nullptr ? nullptr : d_u_buffer_ // switch to nullptr if we don't request energies
+    );
+    gpuErrchk(cudaPeekAtLastError());
+
+    // coords are (N,3)
+    if (d_du_dx) {
+        k_smoothcore_rescale_and_accum_du_dx<RealType, 3, PARAMS_PER_ATOM>
+            <<<B, tpb, 0, stream>>>(N_, d_anchor_idxs_, d_x, d_p, d_perm_, d_sorted_du_dx_, d_du_dx);
+        gpuErrchk(cudaPeekAtLastError());
+    }
+
+    // params are (N, PARAMS_PER_ATOM)
+    if (d_du_dp) {
+        k_smoothcore_rescale_and_accum_du_dp<RealType, 3, PARAMS_PER_ATOM>
+            <<<B, tpb, 0, stream>>>(N_, d_anchor_idxs_, d_x, d_p, d_perm_, d_sorted_du_dp_, d_sorted_du_dx_, d_du_dp);
+        gpuErrchk(cudaPeekAtLastError());
+    }
+    if (d_u) {
+        accumulate_energy(NONBONDED_KERNEL_BLOCKS, d_u_buffer_, d_u, stream);
+    }
+    // Increment steps
+    steps_since_last_sort_++;
+}
+
+template <typename RealType>
+void SmoothcoreNonbondedInteractionGroup<RealType>::set_atom_idxs(
+    const std::vector<int> &row_atom_idxs, const std::vector<int> &col_atom_idxs) {
+
+    this->validate_idxs(N_, row_atom_idxs, col_atom_idxs, true);
+
+    std::vector<unsigned int> unsigned_row_idxs = std::vector<unsigned int>(row_atom_idxs.begin(), row_atom_idxs.end());
+    std::set<unsigned int> unique_row_atom_idxs(unique_idxs(unsigned_row_idxs));
+
+    std::vector<unsigned int> row_atom_idxs_v(set_to_vector(unique_row_atom_idxs));
+    std::vector<unsigned int> col_atom_idxs_v(col_atom_idxs.begin(), col_atom_idxs.end());
+
+    cudaStream_t stream = static_cast<cudaStream_t>(0);
+    if (row_atom_idxs_v.size() == 0 || row_atom_idxs_v.size() == N_) {
+        this->set_atom_idxs_device(col_atom_idxs_v.size(), row_atom_idxs_v.size(), nullptr, nullptr, stream);
+    } else {
+        DeviceBuffer<unsigned int> d_col(col_atom_idxs_v);
+        DeviceBuffer<unsigned int> d_row(row_atom_idxs_v);
+
+        this->set_atom_idxs_device(col_atom_idxs_v.size(), row_atom_idxs_v.size(), d_col.data, d_row.data, stream);
+    }
+    gpuErrchk(cudaStreamSynchronize(stream));
+}
+
+// set_atom_idxs_device is for use when idxs exist on the GPU already and are used as the new idxs to compute the neighborlist on.
+template <typename RealType>
+void SmoothcoreNonbondedInteractionGroup<RealType>::set_atom_idxs_device(
+    const int NC,
+    const int NR,
+    unsigned int *d_in_column_idxs,
+    unsigned int *d_in_row_idxs,
+    const cudaStream_t stream) {
+
+    if (NC + NR > N_) {
+        throw std::runtime_error("number of idxs must be less than or equal to N");
+    }
+    if (NR > 0 && NC > 0) {
+        const size_t tpb = DEFAULT_THREADS_PER_BLOCK;
+
+        // The indices must already be on the GPU and are copied into the potential's buffers.
+        gpuErrchk(cudaMemcpyAsync(
+            d_col_atom_idxs_, d_in_column_idxs, NC * sizeof(*d_col_atom_idxs_), cudaMemcpyDeviceToDevice, stream));
+        gpuErrchk(cudaMemcpyAsync(
+            d_row_atom_idxs_, d_in_row_idxs, NR * sizeof(*d_row_atom_idxs_), cudaMemcpyDeviceToDevice, stream));
+
+        // The neighborlist does not use the indices directly, rather it takes a contiguous set of indices and the ixn group
+        // potential will resort the correct particles into the corresponding arrays. We can use the leftover spaces in the
+        // two d_*_atom_idxs_ arrays to store these nblist indices.
+        // NOTE: The leftover column indices will store the row indices and vice versa.
+        k_arange<<<ceil_divide(NR, tpb), tpb, 0, stream>>>(NR, d_col_atom_idxs_ + NC);
+        gpuErrchk(cudaPeekAtLastError());
+        k_arange<<<ceil_divide(NC, tpb), tpb, 0, stream>>>(NC, d_row_atom_idxs_ + NR, NR);
+        gpuErrchk(cudaPeekAtLastError());
+
+        // Resize the nblist
+        nblist_.resize_device(NC + NR, stream);
+
+        // Force a NBlist rebuild
+        gpuErrchk(cudaMemsetAsync(d_rebuild_nblist_, 1, 1 * sizeof(*d_rebuild_nblist_), stream));
+
+        // Offset into the ends of the arrays that now contain the row and column indices for the nblist
+        nblist_.set_idxs_device(NC, NR, d_row_atom_idxs_ + NR, d_col_atom_idxs_ + NC, stream);
+    }
+
+    // Update the row and column counts
+    this->NR_ = NR;
+    this->NC_ = NC;
+    // Reset the steps so that we do a new sort
+    this->steps_since_last_sort_ = 0;
+}
+
+template <typename RealType>
+void SmoothcoreNonbondedInteractionGroup<RealType>::du_dp_fixed_to_float(
+    const int N, const int P, const unsigned long long *du_dp, double *du_dp_float) {
+
+    for (int i = 0; i < N; i++) {
+        const int idx = i * PARAMS_PER_ATOM;
+        const int idx_charge = idx + PARAM_OFFSET_CHARGE;
+        const int idx_sig = idx + PARAM_OFFSET_SIG;
+        const int idx_eps = idx + PARAM_OFFSET_EPS;
+        const int idx_w = idx + PARAM_OFFSET_W;
+
+        du_dp_float[idx_charge] = FIXED_TO_FLOAT_DU_DP<double, FIXED_EXPONENT_DU_DCHARGE>(du_dp[idx_charge]);
+        du_dp_float[idx_sig] = FIXED_TO_FLOAT_DU_DP<double, FIXED_EXPONENT_DU_DSIG>(du_dp[idx_sig]);
+        du_dp_float[idx_eps] = FIXED_TO_FLOAT_DU_DP<double, FIXED_EXPONENT_DU_DEPS>(du_dp[idx_eps]);
+        du_dp_float[idx_w] = FIXED_TO_FLOAT_DU_DP<double, FIXED_EXPONENT_DU_DW>(du_dp[idx_w]);
+    }
+}
+
+template <typename RealType>
+void SmoothcoreNonbondedInteractionGroup<RealType>::validate_idxs(
+    const int N, const std::vector<int> &row_atom_idxs, const std::vector<int> &col_atom_idxs, const bool allow_empty) {
+
+    if (!allow_empty) {
+        if (row_atom_idxs.size() == 0) {
+            throw std::runtime_error("row_atom_idxs must be nonempty");
+        }
+        if (col_atom_idxs.size() == 0) {
+            throw std::runtime_error("col_atom_idxs must be nonempty");
+        }
+        if (row_atom_idxs.size() == static_cast<long unsigned int>(N)) {
+            throw std::runtime_error("must be less then N(" + std::to_string(N) + ") row indices");
+        }
+        if (col_atom_idxs.size() == static_cast<long unsigned int>(N)) {
+            throw std::runtime_error("must be less then N(" + std::to_string(N) + ") col indices");
+        }
+    }
+    verify_atom_idxs(N, row_atom_idxs, allow_empty);
+    verify_atom_idxs(N, col_atom_idxs, allow_empty);
+
+    // row and col idxs must be disjoint
+    std::set<int> unique_row_idxs(row_atom_idxs.begin(), row_atom_idxs.end());
+    for (int col_atom_idx : col_atom_idxs) {
+        if (unique_row_idxs.find(col_atom_idx) != unique_row_idxs.end()) {
+            throw std::runtime_error("row and col indices must be disjoint");
+        }
+    }
+}
+
+template class SmoothcoreNonbondedInteractionGroup<double>;
+template class SmoothcoreNonbondedInteractionGroup<float>;
+
+} // namespace timemachine
diff --git a/timemachine/cpp/src/smoothcore_nonbonded_interaction_group.hpp b/timemachine/cpp/src/smoothcore_nonbonded_interaction_group.hpp
new file mode 100644
index 0000000000..4bf84e194e
--- /dev/null
+++ b/timemachine/cpp/src/smoothcore_nonbonded_interaction_group.hpp
@@ -0,0 +1,103 @@
+#pragma once
+
+#include "hilbert_sort.hpp"
+#include "neighborlist.hpp"
+#include "nonbonded_common.hpp"
+#include "potential.hpp"
+#include <array>
+#include <memory>
+#include <optional>
+#include <vector>
+
+namespace timemachine {
+
+template <typename RealType> class SmoothcoreNonbondedInteractionGroup : public Potential {
+
+private:
+    const int N_; // total number of atoms, i.e. first dimension of input coords, params
+    int NR_;      // number of row atoms
+    int NC_;      // number of column atoms
+
+    std::array<k_nonbonded_fn, 8> kernel_ptrs_;
+
+    unsigned int *d_col_atom_idxs_;
+    unsigned int *d_row_atom_idxs_;
+    unsigned int *d_anchor_idxs_;
+    int *p_ixn_count_; // pinned memory
+
+    double beta_;
+    double cutoff_;
+    // This is safe to overflow, either reset to 0 or increment
+    unsigned int steps_since_last_sort_;
+    Neighborlist<RealType> nblist_;
+
+    const double nblist_padding_;
+    __int128 *d_u_buffer_;  // [NONBONDED_KERNEL_BLOCKS]
+    double *d_nblist_x_;    // coords which were used to compute the nblist
+    double *d_nblist_box_;  // box which was used to rebuild the nblist
+    int *d_rebuild_nblist_; // whether or not we have to rebuild the nblist
+    int *p_rebuild_nblist_; // pinned
+
+    unsigned int *d_perm_; // hilbert curve permutation
+
+    // "sorted" means
+    // - if hilbert sorting enabled, atoms are sorted into contiguous
+    //   blocks by interaction group, and each block is hilbert-sorted
+    //   independently
+    // - otherwise, atoms are sorted into contiguous blocks by
+    //   interaction group, with arbitrary ordering within each block
+    double *d_rescaled_x_; // rescaled coordinates
+    double *d_sorted_x_;   // sorted coordinates
+    double *d_sorted_p_;   // sorted parameters
+    unsigned long long *d_sorted_du_dx_;
+    unsigned long long *d_sorted_du_dp_;
+
+    std::unique_ptr<HilbertSort> hilbert_sort_;
+
+    cudaEvent_t nblist_flag_sync_event_; // Event to synchronize rebuild flag on
+
+    const bool disable_hilbert_;
+
+    bool needs_sort();
+
+    void sort(const double *d_x, const double *d_box, cudaStream_t stream);
+
+    void validate_idxs(
+        const int N,
+        const std::vector<int> &row_atom_idxs,
+        const std::vector<int> &col_atom_idxs,
+        const bool allow_empty);
+
+public:
+    void set_atom_idxs_device(
+        const int NC, const int NR, unsigned int *d_column_idxs, unsigned int *d_row_idxs, const cudaStream_t stream);
+
+    void set_atom_idxs(const std::vector<int> &row_atom_idxs, const std::vector<int> &col_atom_idxs);
+
+    SmoothcoreNonbondedInteractionGroup(
+        const int N,
+        const std::vector<int> &row_atom_idxs,
+        const std::vector<int> &col_atom_idxs,
+        const std::vector<int> &anchor_idxs,
+        const double beta,
+        const double cutoff,
+        const bool disable_hilbert_sort = false,
+        const double nblist_padding = 0.1);
+
+    ~SmoothcoreNonbondedInteractionGroup();
+
+    virtual void execute_device(
+        const int N,
+        const int P,
+        const double *d_x,
+        const double *d_p,
+        const double *d_box,
+        unsigned long long *d_du_dx,
+        unsigned long long *d_du_dp,
+        __int128 *d_u,
+        cudaStream_t stream) override;
+
+    void du_dp_fixed_to_float(const int N, const int P, const unsigned long long *du_dp, double *du_dp_float) override;
+};
+
+} // namespace timemachine
diff --git a/timemachine/cpp/src/wrap_kernels.cpp b/timemachine/cpp/src/wrap_kernels.cpp
index e592178b74..af42874068 100644
--- a/timemachine/cpp/src/wrap_kernels.cpp
+++ b/timemachine/cpp/src/wrap_kernels.cpp
@@ -40,6 +40,7 @@
 #include "segmented_sumexp.hpp"
 #include "segmented_weighted_random_sampler.hpp"
 #include "set_utils.hpp"
+#include "smoothcore_nonbonded_interaction_group.hpp"
 #include "summed_potential.hpp"
 #include "tibd_exchange_move.hpp"
 #include "translations.hpp"
@@ -1582,6 +1583,95 @@ template <typename RealType> void declare_nonbonded_interaction_group(py::module
             )pbdoc");
 }
 
+template <typename RealType> void declare_smoothcore_nonbonded_interaction_group(py::module &m, const char *typestr) {
+    using Class = SmoothcoreNonbondedInteractionGroup<RealType>;
+    std::string pyclass_name = std::string("SmoothcoreNonbondedInteractionGroup_") + typestr;
+    py::class_<Class, std::shared_ptr<Class>, Potential>(
+        m, pyclass_name.c_str(), py::buffer_protocol(), py::dynamic_attr())
+        .def(
+            "set_atom_idxs",
+            &SmoothcoreNonbondedInteractionGroup<RealType>::set_atom_idxs,
+            py::arg("row_atom_idxs"),
+            py::arg("col_atom_idxs"),
+            R"pbdoc(
+                    Set up the atom idxs for the SmoothcoreNonbondedInteractionGroup.
+                    The interaction is defined between two groups of atom idxs,
+                    `row_atom_idxs` and `col_atom_idxs`. These should be a disjoint
+                    list of idxs.
+
+                    Parameters
+                    ----------
+                    row_atom_idxs: NDArray
+                        First group of atoms in the interaction.
+
+                    col_atom_idxs: NDArray
+                        Second group of atoms in the interaction.
+
+            )pbdoc")
+        .def(
+            py::init([](const int N,
+                        const py::array_t<int, py::array::c_style> &row_atom_idxs_i,
+                        const py::array_t<int, py::array::c_style> &anchor_idxs_i,
+                        const double beta,
+                        const double cutoff,
+                        std::optional<py::array_t<int, py::array::c_style>> &col_atom_idxs_i,
+                        const bool disable_hilbert_sort,
+                        const double nblist_padding) {
+                std::vector<int> row_atom_idxs = py_array_to_vector(row_atom_idxs_i);
+                std::vector<int> anchor_idxs = py_array_to_vector(anchor_idxs_i);
+
+                std::vector<int> col_atom_idxs;
+                if (col_atom_idxs_i) {
+                    col_atom_idxs.resize(col_atom_idxs_i->size());
+                    std::memcpy(col_atom_idxs.data(), col_atom_idxs_i->data(), col_atom_idxs_i->size() * sizeof(int));
+                } else {
+                    std::set<int> unique_row_atom_idxs(unique_idxs(row_atom_idxs));
+                    col_atom_idxs = get_indices_difference(N, unique_row_atom_idxs);
+                }
+
+                return new SmoothcoreNonbondedInteractionGroup<RealType>(
+                    N, row_atom_idxs, col_atom_idxs, anchor_idxs, beta, cutoff, disable_hilbert_sort, nblist_padding);
+            }),
+            py::arg("num_atoms"),
+            py::arg("row_atom_idxs_i"),
+            py::arg("anchor_idxs_i"),
+            py::arg("beta"),
+            py::arg("cutoff"),
+            py::arg("col_atom_idxs_i") = py::none(),
+            py::arg("disable_hilbert_sort") = false,
+            py::arg("nblist_padding") = 0.1,
+            R"pbdoc(
+                    Set up the SmoothcoreNonbondedInteractionGroup.
+
+                    Parameters
+                    ----------
+                    num_atoms: int
+                        Number of atoms.
+
+                    row_atom_idxs: NDArray
+                        First group of atoms in the interaction.
+
+                    anchor_idxs: NDArray
+                        First group of atoms in the interaction.
+
+                    beta: float
+
+                    cutoff: float
+                        Ignore all interactions beyond this distance in nm.
+
+                    col_atom_idxs: Optional[NDArray]
+                        Second group of atoms in the interaction. If not specified,
+                        use all of the atoms not in the `row_atom_idxs`.
+
+                    disable_hilbert_sort: bool
+                        Set to True to disable the Hilbert sort.
+
+                    nblist_padding: float
+                        Margin for the neighborlist.
+
+            )pbdoc");
+}
+
 template <typename RealType, bool Negated> void declare_nonbonded_pair_list(py::module &m, const char *typestr) {
     using Class = NonbondedPairList<RealType, Negated>;
     std::string pyclass_name;
@@ -2235,6 +2325,9 @@ PYBIND11_MODULE(custom_ops, m) {
     declare_nonbonded_interaction_group<double>(m, "f64");
     declare_nonbonded_interaction_group<float>(m, "f32");
 
+    declare_smoothcore_nonbonded_interaction_group<double>(m, "f64");
+    declare_smoothcore_nonbonded_interaction_group<float>(m, "f32");
+
     declare_nonbonded_precomputed<double>(m, "f64");
     declare_nonbonded_precomputed<float>(m, "f32");
 
diff --git a/timemachine/fe/free_energy.py b/timemachine/fe/free_energy.py
index 6aa4195294..9418c3ba07 100644
--- a/timemachine/fe/free_energy.py
+++ b/timemachine/fe/free_energy.py
@@ -892,6 +892,7 @@ def run_sims_sequential(
     for initial_state in initial_states:
         # run simulation
         traj = sample(initial_state, md_params, max_buffer_frames=100)
+
         print(f"completed simulation at lambda={initial_state.lamb}!")
 
         # keep samples from any requested states in memory
@@ -904,6 +905,7 @@ def run_sims_sequential(
         # analysis that depends on current and previous state
         if prev_state:
             state_pair = [prev_state, state]
+
             u_kln_by_component = compute_energy_decomposed_u_kln(state_pair)
             u_kln_by_component_by_lambda.append(u_kln_by_component)
 
@@ -992,6 +994,8 @@ def get_state(lamb: float) -> EnergyDecomposedState[StoredArrays]:
         assert_potentials_compatible(initial_state.potentials, potentials_0)
 
         traj = get_samples(lamb)
+
+        print(f"traj at lambda={initial_state.lamb} coords max {np.amax(traj.frames)}, min {np.min(traj.frames)}")
         batch_u_fns = get_batch_u_fns(unbound_impls, [p.params for p in initial_state.potentials], temperature)
         return EnergyDecomposedState(traj.frames, traj.boxes, batch_u_fns)
 
diff --git a/timemachine/fe/rbfe.py b/timemachine/fe/rbfe.py
index 644f4c6a58..4b6d348735 100644
--- a/timemachine/fe/rbfe.py
+++ b/timemachine/fe/rbfe.py
@@ -534,6 +534,7 @@ def estimate_relative_free_energy_bisection(
     n_windows: Optional[int] = None,
     min_overlap: Optional[float] = None,
     min_cutoff: Optional[float] = 0.7,
+    use_smoothcore: Optional[bool] = False,
 ) -> SimulationResult:
     r"""Estimate relative free energy between mol_a and mol_b via independent simulations with a dynamic lambda schedule
     determined by successively bisecting the lambda interval between the pair of states with the greatest BAR
@@ -588,7 +589,7 @@ def estimate_relative_free_energy_bisection(
         n_windows = DEFAULT_NUM_WINDOWS
     assert n_windows >= 2
 
-    single_topology = SingleTopology(mol_a, mol_b, core, ff)
+    single_topology = SingleTopology(mol_a, mol_b, core, ff, use_smoothcore)
 
     lambda_interval = lambda_interval or (0.0, 1.0)
     lambda_min, lambda_max = lambda_interval[0], lambda_interval[1]
diff --git a/timemachine/fe/single_topology.py b/timemachine/fe/single_topology.py
index 0cb73afc69..9c2614cb90 100644
--- a/timemachine/fe/single_topology.py
+++ b/timemachine/fe/single_topology.py
@@ -32,7 +32,6 @@
     HarmonicAngleStable,
     HarmonicBond,
     Nonbonded,
-    NonbondedInteractionGroup,
     NonbondedPairListPrecomputed,
     PeriodicTorsion,
 )
@@ -1121,6 +1120,66 @@ def __init__(self, mol_a, mol_b, core):
         self.c_to_a = {v: k for k, v in enumerate(self.a_to_c)}
         self.c_to_b = {v: k for k, v in enumerate(self.b_to_c)}
 
+    def mol_a_idxs(self):
+        """
+        Indices in the combined mol that correspond to an atom in mol_a
+        """
+        idxs = []
+        for c_idx, flag in enumerate(self.c_flags):
+            if flag == AtomMapFlags.CORE or flag == AtomMapFlags.MOL_A:
+                idxs.append(c_idx)
+        return np.array(idxs, dtype=np.int32)
+
+    def mol_b_idxs(self):
+        """
+        Indices in the combined mol that correspond to an atom in mol_b
+        """
+        idxs = []
+        for c_idx, flag in enumerate(self.c_flags):
+            if flag == AtomMapFlags.CORE or flag == AtomMapFlags.MOL_B:
+                idxs.append(c_idx)
+        return np.array(idxs, dtype=np.int32)
+
+    def core_idxs(self):
+        """
+        Indices corresponding to non-core atoms
+        """
+        idxs = []
+        for c_idx, flag in enumerate(self.c_flags):
+            if flag == AtomMapFlags.CORE:
+                idxs.append(c_idx)
+        return np.array(idxs, dtype=np.int32)
+
+    def dummy_idxs(self):
+        """
+        Indices corresponding to non-core atoms
+        """
+        idxs = []
+        for c_idx, flag in enumerate(self.c_flags):
+            if flag != AtomMapFlags.CORE:
+                idxs.append(c_idx)
+        return np.array(idxs, dtype=np.int32)
+
+    def dummy_a_idxs(self):
+        """
+        Indices corresponding to non-core atoms in mol_a
+        """
+        idxs = []
+        for c_idx, flag in enumerate(self.c_flags):
+            if flag == AtomMapFlags.MOL_A:
+                idxs.append(c_idx)
+        return np.array(idxs, dtype=np.int32)
+
+    def dummy_b_idxs(self):
+        """
+        Indices corresponding to non-core atoms in mol_b
+        """
+        idxs = []
+        for c_idx, flag in enumerate(self.c_flags):
+            if flag == AtomMapFlags.MOL_B:
+                idxs.append(c_idx)
+        return np.array(idxs, dtype=np.int32)
+
     def get_num_atoms(self):
         """
         Get the total number of atoms in the alchemical hybrid.
@@ -1143,6 +1202,65 @@ def get_num_dummy_atoms(self):
         """
         return self.mol_a.GetNumAtoms() + self.mol_b.GetNumAtoms() - len(self.core) - len(self.core)
 
+    def combine_confs(self, x_a, x_b, lamb=1.0):
+        """
+        Combine conformations of two molecules.
+
+        TODO: interpolate confs based on the lambda value?
+
+        Parameters
+        ----------
+        x_a: np.array of shape (N_A,3)
+            First conformation
+
+        x_b: np.array of shape (N_B,3)
+            Second conformation
+
+        lamb: optional float
+            if lamb > 0.5, map atoms from x_a first, then overwrite with x_b,
+            otherwise use opposite order
+
+        Returns
+        -------
+        np.array of shape (self.num_atoms,3)
+            Combined conformation
+
+        """
+        if lamb < 0.5:
+            return self.combine_confs_lhs(x_a, x_b)
+        else:
+            return self.combine_confs_rhs(x_a, x_b)
+
+    def combine_confs_rhs(self, x_a, x_b):
+        """
+        Combine x_a and x_b conformations for lambda=1
+        """
+        # place a first, then b overrides a
+        assert x_a.shape == (self.mol_a.GetNumAtoms(), 3)
+        assert x_b.shape == (self.mol_b.GetNumAtoms(), 3)
+        x0 = np.zeros((self.get_num_atoms(), 3))
+        for src, dst in enumerate(self.a_to_c):
+            x0[dst] = x_a[src]
+        for src, dst in enumerate(self.b_to_c):
+            x0[dst] = x_b[src]
+
+        return x0
+
+    def combine_confs_lhs(self, x_a, x_b):
+        """
+        Combine x_a and x_b conformations for lambda=0
+        """
+        # place b first, then a overrides b
+        assert x_a.shape == (self.mol_a.GetNumAtoms(), 3)
+        assert x_b.shape == (self.mol_b.GetNumAtoms(), 3)
+        x0 = np.zeros((self.get_num_atoms(), 3))
+        for src, dst in enumerate(self.b_to_c):
+            x0[dst] = x_b[src]
+        for src, dst in enumerate(self.a_to_c):
+            x0[dst] = x_a[src]
+
+        return x0
+
 
 _Bonded = TypeVar("_Bonded", bound=Union[ChiralAtomRestraint, HarmonicAngleStable, HarmonicBond, PeriodicTorsion])
 
@@ -1313,7 +1431,7 @@ def check_chiral_validity(
 
 
 class SingleTopology(AtomMapMixin):
-    def __init__(self, mol_a, mol_b, core, forcefield):
+    def __init__(self, mol_a, mol_b, core, forcefield, use_smoothcore=False):
         """
         SingleTopology combines two molecules through a common core. The combined mol has
         atom indices laid out such that mol_a is identically mapped to the combined mol indices.
@@ -1365,6 +1483,25 @@ def __init__(self, mol_a, mol_b, core, forcefield):
             self.dst_system.bond.potential.idxs,
         )
 
+        self.use_smoothcore = use_smoothcore
+
+    def get_smoothcore_ligand_anchor_idxs(self):
+        ligand_anchor_idxs = np.zeros(self.get_num_atoms(), dtype=np.int32)
+
+        for c_idx, c_flag in enumerate(self.c_flags):
+            if c_flag == AtomMapFlags.CORE:
+                ligand_anchor_idxs[c_idx] = c_idx
+
+        for anchor, dummy_group in self.dummy_groups_ab.items():
+            for dummy_atom in dummy_group:
+                ligand_anchor_idxs[self.b_to_c[dummy_atom]] = self.b_to_c[anchor]
+
+        for anchor, dummy_group in self.dummy_groups_ba.items():
+            for dummy_atom in dummy_group:
+                ligand_anchor_idxs[self.a_to_c[dummy_atom]] = self.a_to_c[anchor]
+
+        return ligand_anchor_idxs
+
     def combine_masses(self, use_hmr=False):
         """
         Combine masses between two end-states by taking the heavier of the two core atoms.
@@ -1408,65 +1545,6 @@ def combine_masses(self, use_hmr=False):
 
         return mol_c_masses
 
-    def combine_confs(self, x_a, x_b, lamb=1.0):
-        """
-        Combine conformations of two molecules.
-
-        TODO: interpolate confs based on the lambda value?
-
-        Parameters
-        ----------
-        x_a: np.array of shape (N_A,3)
-            First conformation
-
-        x_b: np.array of shape (N_B,3)
-            Second conformation
-
-        lamb: optional float
-            if lamb > 0.5, map atoms from x_a first, then overwrite with x_b,
-            otherwise use opposite order
-
-        Returns
-        -------
-        np.array of shape (self.num_atoms,3)
-            Combined conformation
-
-        """
-        if lamb < 0.5:
-            return self.combine_confs_lhs(x_a, x_b)
-        else:
-            return self.combine_confs_rhs(x_a, x_b)
-
-    def combine_confs_rhs(self, x_a, x_b):
-        """
-        Combine x_a and x_b conformations for lambda=1
-        """
-        # place a first, then b overrides a
-        assert x_a.shape == (self.mol_a.GetNumAtoms(), 3)
-        assert x_b.shape == (self.mol_b.GetNumAtoms(), 3)
-        x0 = np.zeros((self.get_num_atoms(), 3))
-        for src, dst in enumerate(self.a_to_c):
-            x0[dst] = x_a[src]
-        for src, dst in enumerate(self.b_to_c):
-            x0[dst] = x_b[src]
-
-        return x0
-
-    def combine_confs_lhs(self, x_a, x_b):
-        """
-        Combine x_a and x_b conformations for lambda=0
-        """
-        # place b first, then a overrides b
-        assert x_a.shape == (self.mol_a.GetNumAtoms(), 3)
-        assert x_b.shape == (self.mol_b.GetNumAtoms(), 3)
-        x0 = np.zeros((self.get_num_atoms(), 3))
-        for src, dst in enumerate(self.b_to_c):
-            x0[dst] = x_b[src]
-        for src, dst in enumerate(self.a_to_c):
-            x0[dst] = x_a[src]
-
-        return x0
-
     def _setup_end_state_src(self):
         """
         Setup the source end-state, where mol_a is fully interacting, with mol_b's dummy atoms attached
@@ -1777,6 +1855,34 @@ def mol(self, lamb, min_bond_k=100.0) -> Chem.Mol:
         # make read-only
         return Chem.Mol(mol)
 
+    def _get_smoothcore_guest_params(self, q_handle, lj_handle, lamb: float):
+        mol_a_charges = q_handle.parameterize(self.mol_a)
+        mol_b_charges = q_handle.parameterize(self.mol_b)
+        mol_a_lj = lj_handle.parameterize(self.mol_a)
+        mol_b_lj = lj_handle.parameterize(self.mol_b)
+        mol_a_idxs = self.mol_a_idxs()
+        mol_b_idxs = self.mol_b_idxs()
+
+        charges_src = np.zeros(self.get_num_atoms())
+        charges_dst = np.zeros(self.get_num_atoms())
+        charges_src[mol_a_idxs] = mol_a_charges[np.array([self.c_to_a[x] for x in mol_a_idxs])]
+        charges_dst[mol_b_idxs] = mol_b_charges[np.array([self.c_to_b[x] for x in mol_b_idxs])]
+
+        lj_src = np.zeros((self.get_num_atoms(), 2))
+        lj_dst = np.zeros((self.get_num_atoms(), 2))
+        lj_src[mol_a_idxs] = mol_a_lj[np.array([self.c_to_a[x] for x in mol_a_idxs])]
+        lj_dst[mol_b_idxs] = mol_b_lj[np.array([self.c_to_b[x] for x in mol_b_idxs])]
+
+        from timemachine.potentials.nonbonded import smoothcore_ligand_param_interpolation
+
+        np.testing.assert_almost_equal(np.sum(charges_src), np.sum(charges_dst), decimal=5)
+
+        ligand_params = smoothcore_ligand_param_interpolation(lamb, charges_src, charges_dst, lj_src, lj_dst, self)
+
+        np.testing.assert_almost_equal(np.sum(charges_src), np.sum(ligand_params[:, 0]), decimal=5)
+
+        return ligand_params
+
     def _get_guest_params(self, q_handle, lj_handle, lamb: float, cutoff: float) -> jax.Array:
         """
         Return an array containing the guest_charges, guest_sigmas, guest_epsilons, guest_w_coords
@@ -1861,16 +1967,14 @@ def _parameterize_host_nonbonded(self, host_nonbonded: BoundPotential[Nonbonded]
 
         return combined_nonbonded.bind(hg_nb_params)
 
-    def _parameterize_host_guest_nonbonded_ixn(
+    def _parameterize_unbound_host_guest_nonbonded_ixn(
         self, lamb, host_nonbonded: BoundPotential[Nonbonded], num_water_atoms: int
-    ) -> BoundPotential[NonbondedInteractionGroup]:
+    ) -> tuple:
         """Parameterize nonbonded interactions between the host and guest"""
         num_host_atoms = host_nonbonded.params.shape[0]
         num_guest_atoms = self.get_num_atoms()
         cutoff = host_nonbonded.potential.cutoff
 
-        guest_ixn_env_params = self._get_guest_params(self.ff.q_handle, self.ff.lj_handle, lamb, cutoff)
-
         # L-W terms
         num_other_atoms = num_host_atoms - num_water_atoms
 
@@ -1886,17 +1990,39 @@ def get_other_idxs():
         def get_env_idxs():
             return np.array(list(get_other_idxs()) + list(get_water_idxs()), dtype=np.int32)
 
-        ixn_pot, ixn_params = get_ligand_ixn_pots_params(
-            get_lig_idxs(),
-            get_env_idxs(),
-            host_nonbonded.params,
-            guest_ixn_env_params,
-            beta=host_nonbonded.potential.beta,
-            cutoff=cutoff,
-        )
+        from timemachine.fe.topology import get_smoothcore_ligand_ixn_pots_params
+
+        if self.use_smoothcore:
+            print("WARNING: Using smoothcore nonbonded interactions...")
+            guest_ixn_env_params = self._get_smoothcore_guest_params(self.ff.q_handle, self.ff.lj_handle, lamb)
+            ixn_pot, ixn_params = get_smoothcore_ligand_ixn_pots_params(
+                get_lig_idxs(),
+                self.get_smoothcore_ligand_anchor_idxs(),
+                get_env_idxs(),
+                host_nonbonded.params,
+                guest_ixn_env_params,
+                beta=host_nonbonded.potential.beta,
+                cutoff=cutoff,
+            )
+        else:
+            guest_ixn_env_params = self._get_guest_params(self.ff.q_handle, self.ff.lj_handle, lamb, cutoff)
+            ixn_pot, ixn_params = get_ligand_ixn_pots_params(
+                get_lig_idxs(),
+                get_env_idxs(),
+                host_nonbonded.params,
+                guest_ixn_env_params,
+                beta=host_nonbonded.potential.beta,
+                cutoff=cutoff,
+            )
+
+        return ixn_pot, ixn_params
+
+    def _parameterize_host_guest_nonbonded_ixn(
+        self, lamb, host_nonbonded: BoundPotential[Nonbonded], num_water_atoms: int
+    ):
+        ixn_pot, ixn_params = self._parameterize_unbound_host_guest_nonbonded_ixn(lamb, host_nonbonded, num_water_atoms)
 
-        bound_ixn_pot = ixn_pot.bind(ixn_params)
-        return bound_ixn_pot
+        return ixn_pot.bind(ixn_params)
 
     def combine_with_host(self, host_system: VacuumSystem, lamb: float, num_water_atoms: int) -> HostGuestSystem:
         """
@@ -1915,7 +2041,7 @@ def combine_with_host(self, host_system: VacuumSystem, lamb: float, num_water_at
         Parameters
         ----------
         host_system: VacuumSystem
-            Parameterized system of the host
+            Parameterized host system.
 
         lamb: float
             Which lambda value we want to generate the combined system.
diff --git a/timemachine/fe/topology.py b/timemachine/fe/topology.py
index 23c4725686..8702935acb 100644
--- a/timemachine/fe/topology.py
+++ b/timemachine/fe/topology.py
@@ -696,7 +696,7 @@ def get_ligand_ixn_pots_params(
     guest_params_ixn_env: Params,
     beta=2.0,
     cutoff=1.2,
-) -> Tuple[potentials.NonbondedInteractionGroup, Params]:
+):
     """
     Return the interaction group potentials and corresponding parameters
     for the ligand-water and ligand-protein interaction terms.
@@ -725,7 +725,7 @@ def get_ligand_ixn_pots_params(
     num_lig_atoms = len(lig_idxs)
     num_total_atoms = num_lig_atoms + len(env_idxs)
 
-    hg_ixn_pot = potentials.NonbondedInteractionGroup(
+    ixn_pot = potentials.NonbondedInteractionGroup(
         num_total_atoms,
         lig_idxs,
         beta,
@@ -734,4 +734,63 @@ def get_ligand_ixn_pots_params(
     )
 
     hg_ixn_params = jnp.concatenate([host_nb_params, guest_params_ixn_env])
-    return hg_ixn_pot, hg_ixn_params
+
+    return ixn_pot, hg_ixn_params
+
+
+def get_smoothcore_ligand_ixn_pots_params(
+    lig_idxs: NDArray,
+    lig_anchor_idxs: NDArray,
+    env_idxs: Optional[NDArray],
+    host_nb_params: Params,
+    guest_params_ixn_env: Params,
+    beta=2.0,
+    cutoff=1.2,
+):
+    """
+    Return the interaction group potentials and corresponding parameters
+    for the ligand-water and ligand-protein interaction terms.
+
+    Parameters
+    ----------
+    lig_idxs:
+        List of ligand indexes (dtype, np.int32), one for each ligand.
+
+    lig_anchor_idxs:
+        List of ligand anchor indices denoting the origin for each atom.
+
+    env_idxs:
+        Indexes for the environment atoms including waters.
+        May be None if there are no other atoms.
+
+    host_nb_params:
+        Nonbonded parameters for the host (environment) atoms.
+
+    guest_params_ixn_env:
+        Parameters for the guest (ligand) NB interactions with the
+        environment atoms.
+    """
+
+    # Init
+    env_idxs = env_idxs if env_idxs is not None else np.array([])
+
+    # Ligand-Env terms
+    num_lig_atoms = len(lig_idxs)
+    num_host_atoms = len(env_idxs)
+    num_total_atoms = num_lig_atoms + len(env_idxs)
+
+    lig_anchor_idxs += num_host_atoms
+    anchor_idxs = np.concatenate([np.arange(num_host_atoms), lig_anchor_idxs])
+
+    ixn_pot = potentials.SmoothcoreNonbondedInteractionGroup(
+        num_total_atoms,
+        lig_idxs.astype(np.int32),
+        anchor_idxs.astype(np.int32),
+        beta,
+        cutoff,
+        col_atom_idxs=env_idxs.astype(np.int32),
+    )
+
+    hg_ixn_params = jnp.concatenate([host_nb_params, guest_params_ixn_env])
+
+    return ixn_pot, hg_ixn_params
diff --git a/timemachine/ff/handlers/nonbonded.py b/timemachine/ff/handlers/nonbonded.py
index 37ccc91ee2..826c83ad7d 100644
--- a/timemachine/ff/handlers/nonbonded.py
+++ b/timemachine/ff/handlers/nonbonded.py
@@ -698,6 +698,7 @@ def static_parameterize(params, smirks, mol):
         assert q_params.shape[0] == mol.GetNumAtoms()  # check that return shape is consistent with input mol
 
         return q_params
+        # return jnp.zeros_like(q_params)
 
 
 class AM1CCCIntraHandler(AM1CCCHandler):
diff --git a/timemachine/lib/custom_ops.pyi b/timemachine/lib/custom_ops.pyi
index f91fabf372..aae4c00524 100644
--- a/timemachine/lib/custom_ops.pyi
+++ b/timemachine/lib/custom_ops.pyi
@@ -240,6 +240,14 @@ class SegmentedWeightedRandomSampler_f64:
     def __init__(self, max_vals_per_segment: int, segments: int, seed: int) -> None: ...
     def sample(self, weights: List[List[float]]) -> List[int]: ...
 
+class SmoothcoreNonbondedInteractionGroup_f32(Potential):
+    def __init__(self, num_atoms: int, row_atom_idxs_i: numpy.typing.NDArray[numpy.int32], anchor_idxs_i: numpy.typing.NDArray[numpy.int32], beta: float, cutoff: float, col_atom_idxs_i: Optional[numpy.typing.NDArray[numpy.int32]] = ..., disable_hilbert_sort: bool = ..., nblist_padding: float = ...) -> None: ...
+    def set_atom_idxs(self, row_atom_idxs: List[int], col_atom_idxs: List[int]) -> None: ...
+
+class SmoothcoreNonbondedInteractionGroup_f64(Potential):
+    def __init__(self, num_atoms: int, row_atom_idxs_i: numpy.typing.NDArray[numpy.int32], anchor_idxs_i: numpy.typing.NDArray[numpy.int32], beta: float, cutoff: float, col_atom_idxs_i: Optional[numpy.typing.NDArray[numpy.int32]] = ..., disable_hilbert_sort: bool = ..., nblist_padding: float = ...) -> None: ...
+    def set_atom_idxs(self, row_atom_idxs: List[int], col_atom_idxs: List[int]) -> None: ...
+
 class SummedPotential(Potential):
     def __init__(self, potentials: List[Potential], params_sizes: List[int], parallel: bool = ...) -> None: ...
     def get_potentials(self) -> List[Potential]: ...
diff --git a/timemachine/potentials/__init__.py b/timemachine/potentials/__init__.py
index f19ad256a4..40e5973560 100644
--- a/timemachine/potentials/__init__.py
+++ b/timemachine/potentials/__init__.py
@@ -16,6 +16,7 @@
     NonbondedPairList,
     NonbondedPairListPrecomputed,
     PeriodicTorsion,
+    SmoothcoreNonbondedInteractionGroup,
     SummedPotential,
     make_summed_potential,
 )
@@ -36,6 +37,7 @@
     "Nonbonded",
     "NonbondedAllPairs",
     "NonbondedInteractionGroup",
+    "SmoothcoreNonbondedInteractionGroup",
     "NonbondedPairList",
     "NonbondedExclusions",
     "NonbondedPairListPrecomputed",
diff --git a/timemachine/potentials/nonbonded.py b/timemachine/potentials/nonbonded.py
index c32b7fcaba..58363870f7 100644
--- a/timemachine/potentials/nonbonded.py
+++ b/timemachine/potentials/nonbonded.py
@@ -126,8 +126,8 @@ def nonbonded_block_unsummed(
 
     dij = delta_r(ri, rj, box)
     dij = jnp.concatenate([dij, (w_i - w_j).reshape(*dij.shape[:-1], 1)], axis=-1)
-
     dij = jnp.linalg.norm(dij, axis=-1)
+
     sig_i = jnp.expand_dims(params_i[:, 1], axis=1)
     sig_j = jnp.expand_dims(params_j[:, 1], axis=0)
     eps_i = jnp.expand_dims(params_i[:, 2], axis=1)
@@ -143,7 +143,7 @@ def nonbonded_block_unsummed(
 
     es = switched_direct_space_pme(dij, qij, beta, cutoff)
     lj = lennard_jones(dij, sig_ij, eps_ij)
-
+    lj = jnp.where(eps_ij != 0, lj, 0)  # needed for self consistent du_deps w.r.t. GPU
     nrgs = jnp.where(dij < cutoff, es + lj, 0)
     return cast(Array, nrgs)
 
@@ -155,6 +155,56 @@ def nonbonded_block(xi, xj, box, params_i, params_j, beta, cutoff):
     return jnp.sum(nonbonded_block_unsummed(xi, xj, box, params_i, params_j, beta, cutoff))
 
 
+import itertools
+
+
+def nonbonded_block_rescaled(xi, xj, box, params_i, params_j, beta, cutoff, anchors_i, groups_i):
+    """
+
+    Soft-core variation where w_i is applied by means of rescaling dummy atoms relative to an
+    arbitrary choice of the anchoring atom. If w_i is 0, then atom i is fully interacting.
+    If w_i is 1, then atom_i is fully non-interacting. If 0 < w_i < 1 then atom_i is scaled
+    relative to the location of the anchoring atom for the group that atom_i belongs to. Atoms
+    that are not part of the groups_i ignore their w_i values.
+
+    Note that we assume that charges have *already* been balanced correctly in the upstream
+    interpolation code.
+
+    Example:
+
+    anchors_i = [0, 5, 9]
+    groups_i = [[3,4], [7], [6,8,2]]
+    w_i = [[0.2, 0.2], [0.8], [0.2, 0.2]]
+
+    """
+
+    xi = jnp.array(xi)
+    params_i = jnp.array(params_i)
+
+    assert len(anchors_i) == len(groups_i)
+
+    # transform xi
+    xi_a = xi[np.repeat(anchors_i, [len(x) for x in groups_i])]
+
+    group_idxs = jnp.array(list(itertools.chain(*groups_i)))
+
+    excluded_idxs = list(set(range(len(xi))).difference(group_idxs.tolist()))
+    excluded_idxs = jnp.array(excluded_idxs)
+
+    np.testing.assert_array_equal(params_i[excluded_idxs, -1], 0)
+
+    xi_g = xi[group_idxs]
+    scale_i = jnp.expand_dims(1 - params_i[:, -1], axis=-1)[group_idxs]
+    # compute the displacement vector
+    r_ga = scale_i * (xi_g - xi_a)
+    xi = xi.at[group_idxs].set(xi_g + r_ga)
+
+    # set q,s,e,w all to zero if scale_i is exactly zero.
+    params_i = params_i.at[jnp.argwhere(scale_i <= 0)].set(0)
+
+    return nonbonded_block(xi, xj, box, params_i, params_j, beta, cutoff)
+
+
 def convert_exclusions_to_rescale_masks(exclusion_idxs, scales, N):
     """Converts exclusions from list format used in Nonbonded to mask format used in `nonbonded`"""
 
@@ -881,3 +931,104 @@ def lj_interaction_group_energy(sig_ligand, eps_ligand, lj_prefactors):
 
     projection = vmap(basis_expand_lj_atom)(sig_ligand, eps_ligand)
     return jnp.sum(projection * lj_prefactors)
+
+
+def smoothcore_charge_interpolation(lamb, charges_src, charges_dst, mol_a_idxs, mol_b_idxs):
+    # absorb then shrink
+
+    net_charge_src = np.sum(charges_src)
+    net_charge_dst = np.sum(charges_dst)
+
+    np.testing.assert_almost_equal(net_charge_src, net_charge_dst, decimal=5)
+    charges_interpolated = np.zeros_like(charges_src)
+
+    charges_interpolated[mol_a_idxs] += (1 - lamb) * charges_src[mol_a_idxs]
+    charges_interpolated[mol_b_idxs] += lamb * charges_dst[mol_b_idxs]
+
+    np.testing.assert_almost_equal(np.sum(charges_interpolated), np.sum(net_charge_src), decimal=5)
+
+    return charges_interpolated
+
+
+# (ytz): guesstimate of the minimal numerically stable eps/sig pair that is 1-step removable
+# MIN_SMOOTHCORE_SIG_HALF = 0.05
+# MIN_SMOOTHCORE_EPS_SQRT = 0.10
+
+MIN_SMOOTHCORE_SIG_HALF = 0.0  # additive combining rule
+MIN_SMOOTHCORE_EPS_SQRT = 0.0  # multiplicate combining rule
+
+
+def smoothcore_lj_interpolation(lamb, lj_src, lj_dst, mol_a_idxs, mol_b_idxs):
+    MIN_SIG_EPS = np.where(lamb > 0 and lamb < 1, [MIN_SMOOTHCORE_SIG_HALF, MIN_SMOOTHCORE_EPS_SQRT], [0.0, 0.0])
+    # core atoms are interpolated linearly
+    # dummy atoms [sig,eps] are scaled down to [min(anchor_sig, dummy_sig), min(anchor_eps, dummy_eps)] except
+    # at the end-points, where they're both zero.
+    lj_src_stable = np.repeat([MIN_SIG_EPS], len(lj_src), axis=0)
+    lj_dst_stable = np.repeat([MIN_SIG_EPS], len(lj_dst), axis=0)
+
+    lj_src_stable[mol_a_idxs] = lj_src[mol_a_idxs]
+    lj_dst_stable[mol_b_idxs] = lj_dst[mol_b_idxs]
+    lj_interpolated = np.repeat([MIN_SIG_EPS], len(lj_src), axis=0)
+
+    lj_interpolated[mol_a_idxs] += np.clip((1 - lamb) * 10, 0, 1) * lj_src[mol_a_idxs]
+    lj_interpolated[mol_b_idxs] += np.clip(lamb * 10, 0, 1) * lj_dst[mol_b_idxs]
+
+    return lj_interpolated
+
+
+def smoothcore_ligand_param_interpolation(lamb, charges_src, charges_dst, lj_src, lj_dst, amm):
+    # When turning the dummy atoms, first turn on the parameters to full strength while in a shrunken state
+    # Then expand out the dummy atoms slowly
+    q_lamb = lamb
+    s_lamb = lamb
+    w_lamb = lamb
+    # p_lamb = np.clip(2 * lamb, 0, 1)
+    # w_lamb = np.clip((lamb - 0.5)*2, 0, 1)
+
+    mol_a_idxs = amm.mol_a_idxs()
+    mol_b_idxs = amm.mol_b_idxs()
+    dummy_a_idxs = amm.dummy_a_idxs()
+    dummy_b_idxs = amm.dummy_b_idxs()
+
+    charges = smoothcore_charge_interpolation(q_lamb, charges_src, charges_dst, mol_a_idxs, mol_b_idxs)
+    charges = charges.reshape((-1, 1))
+    sig_eps = smoothcore_lj_interpolation(s_lamb, lj_src, lj_dst, mol_a_idxs, mol_b_idxs)
+    w_offsets = np.zeros(amm.get_num_atoms())
+    w_offsets[dummy_a_idxs] = w_lamb
+    w_offsets[dummy_b_idxs] = 1 - w_lamb
+    w_offsets = w_offsets.reshape((-1, 1))
+    ligand_params = np.hstack([charges, sig_eps, w_offsets])
+
+    return ligand_params
+
+
+def smoothcore_nonbonded_interaction_group(conf, params, box, row_atom_idxs, col_atom_idxs, anchor_idxs, beta, cutoff):
+    conf = jnp.array(conf)
+    params = jnp.array(params)
+
+    anchor_confs = conf[anchor_idxs]
+    rescale_factors = jnp.expand_dims(1 - params[:, -1], axis=-1)
+    conf = anchor_confs + rescale_factors * (conf - anchor_confs)
+
+    row_xs = conf[row_atom_idxs]
+    col_xs = conf[col_atom_idxs]
+
+    # reset w_coords
+    params = params.at[:, -1].set(0)
+    row_params = jnp.array(params[row_atom_idxs])
+    col_params = jnp.array(params[col_atom_idxs])
+
+    return nonbonded_block(row_xs, col_xs, box, row_params, col_params, beta, cutoff)
+
+
+def reference_nonbonded_interaction_group(conf, params, box, row_atom_idxs, col_atom_idxs, beta, cutoff):
+    conf = jnp.array(conf)
+    params = jnp.array(params)
+
+    row_params = jnp.array(params[row_atom_idxs])
+    col_params = jnp.array(params[col_atom_idxs])
+
+    row_xs = conf[row_atom_idxs]
+    col_xs = conf[col_atom_idxs]
+
+    return nonbonded_block(row_xs, col_xs, box, row_params, col_params, beta, cutoff)
diff --git a/timemachine/potentials/potentials.py b/timemachine/potentials/potentials.py
index 6e7a2af5bf..78e15048a9 100644
--- a/timemachine/potentials/potentials.py
+++ b/timemachine/potentials/potentials.py
@@ -163,8 +163,6 @@ class NonbondedInteractionGroup(Potential):
     nblist_padding: float = 0.1
 
     def __call__(self, conf: Conf, params: Params, box: Optional[Box]) -> float | Array:
-        num_atoms, _ = jnp.array(conf).shape
-
         vdW, electrostatics = nonbonded.nonbonded_interaction_groups(
             conf,
             params,
@@ -177,6 +175,30 @@ def __call__(self, conf: Conf, params: Params, box: Optional[Box]) -> float | Ar
         return jnp.sum(vdW) + jnp.sum(electrostatics)
 
 
+@dataclass
+class SmoothcoreNonbondedInteractionGroup(Potential):
+    num_atoms: int
+    row_atom_idxs: NDArray[np.int32]
+    anchor_idxs: NDArray[np.int32]
+    beta: float
+    cutoff: float
+    col_atom_idxs: Optional[NDArray[np.int32]] = None
+    disable_hilbert_sort: bool = False
+    nblist_padding: float = 0.1
+
+    def __call__(self, conf: Conf, params: Params, box: Optional[Box]) -> float | Array:
+        return nonbonded.smoothcore_nonbonded_interaction_group(
+            conf,
+            params,
+            box,
+            self.row_atom_idxs,
+            self.col_atom_idxs,
+            self.anchor_idxs,
+            self.beta,
+            self.cutoff,
+        )
+
+
 @dataclass
 class NonbondedPairList(Potential):
     idxs: NDArray[np.int32]
diff --git a/timemachine/testsystems/data/benzyl.sdf b/timemachine/testsystems/data/benzyl.sdf
new file mode 100644
index 0000000000..dda43f1056
--- /dev/null
+++ b/timemachine/testsystems/data/benzyl.sdf
@@ -0,0 +1,274 @@
+benzene
+                    3D
+ Structure written by MMmdl.
+ 12 12  0  0  1  0            999 V2000
+   -0.0059    0.0056   -0.0624 C   0  0  0  0  0  0
+    0.0329   -0.6666   -1.2782 C   0  0  0  0  0  0
+    0.0664   -2.0558   -1.3034 C   0  0  0  0  0  0
+    0.0612   -2.7728   -0.1128 C   0  0  0  0  0  0
+    0.0224   -2.1005    1.1030 C   0  0  0  0  0  0
+   -0.0111   -0.7114    1.1282 C   0  0  0  0  0  0
+   -0.0320    1.0872   -0.0428 H   0  0  0  0  0  0
+    0.0370   -0.1084   -2.2052 H   0  0  0  0  0  0
+    0.0966   -2.5792   -2.2500 H   0  0  0  0  0  0
+    0.0873   -3.8543   -0.1324 H   0  0  0  0  0  0
+    0.0183   -2.6587    2.0300 H   0  0  0  0  0  0
+   -0.0413   -0.1880    2.0748 H   0  0  0  0  0  0
+  1  2  1  0  0  0
+  1  6  2  0  0  0
+  1  7  1  0  0  0
+  2  3  2  0  0  0
+  2  8  1  0  0  0
+  3  4  1  0  0  0
+  3  9  1  0  0  0
+  4  5  2  0  0  0
+  4 10  1  0  0  0
+  5  6  1  0  0  0
+  5 11  1  0  0  0
+  6 12  1  0  0  0
+M  END
+> <s_m_entry_id>
+1
+
+> <s_m_entry_name>
+entry
+
+> <r_f3d_energy>
+0
+
+> <i_f3d_flags>
+0
+
+$$$$
+phenol
+                    3D
+ Structure written by MMmdl.
+ 13 13  0  0  1  0            999 V2000
+   -0.0059    0.0056   -0.0624 C   0  0  0  0  0  0
+    0.0329   -0.6666   -1.2782 C   0  0  0  0  0  0
+    0.0664   -2.0558   -1.3034 C   0  0  0  0  0  0
+    0.0612   -2.7728   -0.1128 C   0  0  0  0  0  0
+    0.0224   -2.1005    1.1030 C   0  0  0  0  0  0
+   -0.0111   -0.7114    1.1282 C   0  0  0  0  0  0
+   -0.0320    1.0872   -0.0428 H   0  0  0  0  0  0
+    0.0370   -0.1084   -2.2052 H   0  0  0  0  0  0
+    0.0966   -2.5792   -2.2500 H   0  0  0  0  0  0
+    0.0873   -3.8543   -0.1324 H   0  0  0  0  0  0
+    0.0183   -2.6587    2.0300 H   0  0  0  0  0  0
+   -0.0492   -0.0516    2.3214 O   0  0  0  0  0  0
+   -0.0475   -0.6884    3.0399 H   0  0  0  0  0  0
+  1  2  1  0  0  0
+  1  6  2  0  0  0
+  1  7  1  0  0  0
+  2  3  2  0  0  0
+  2  8  1  0  0  0
+  3  4  1  0  0  0
+  3  9  1  0  0  0
+  4  5  2  0  0  0
+  4 10  1  0  0  0
+  5  6  1  0  0  0
+  5 11  1  0  0  0
+  6 12  1  0  0  0
+ 12 13  1  0  0  0
+M  END
+> <s_m_entry_id>
+2
+
+> <s_m_entry_name>
+entry
+
+> <r_f3d_energy>
+0
+
+> <i_f3d_flags>
+0
+
+$$$$
+benzene_t_butyl
+                    3D
+ Structure written by MMmdl.
+ 24 24  0  0  1  0            999 V2000
+   -0.0059    0.0056   -0.0624 C   0  0  0  0  0  0
+    0.0329   -0.6666   -1.2782 C   0  0  0  0  0  0
+    0.0664   -2.0558   -1.3034 C   0  0  0  0  0  0
+    0.0612   -2.7728   -0.1128 C   0  0  0  0  0  0
+    0.0224   -2.1005    1.1030 C   0  0  0  0  0  0
+   -0.0111   -0.7114    1.1282 C   0  0  0  0  0  0
+   -0.0320    1.0872   -0.0428 H   0  0  0  0  0  0
+    0.0370   -0.1084   -2.2052 H   0  0  0  0  0  0
+    0.0966   -2.5792   -2.2500 H   0  0  0  0  0  0
+    0.0873   -3.8543   -0.1324 H   0  0  0  0  0  0
+    0.0183   -2.6587    2.0300 H   0  0  0  0  0  0
+   -0.0541    0.0335    2.4754 C   0  0  0  0  0  0
+   -0.0514   -0.9879    3.6279 C   0  0  0  0  0  0
+    1.1796    0.9471    2.5973 C   0  0  0  0  0  0
+   -1.3336    0.8873    2.5502 C   0  0  0  0  0  0
+    1.1492    1.4743    3.5508 H   0  0  0  0  0  0
+    2.0858    0.3437    2.5451 H   0  0  0  0  0  0
+    1.1783    1.6709    1.7823 H   0  0  0  0  0  0
+   -1.3640    1.4145    3.5037 H   0  0  0  0  0  0
+   -1.3362    1.6111    1.7352 H   0  0  0  0  0  0
+   -2.2075    0.2415    2.4646 H   0  0  0  0  0  0
+   -0.0818   -0.4607    4.5814 H   0  0  0  0  0  0
+   -0.9246   -1.6347    3.5429 H   0  0  0  0  0  0
+    0.8542   -1.5924    3.5762 H   0  0  0  0  0  0
+  1  2  1  0  0  0
+  1  6  2  0  0  0
+  1  7  1  0  0  0
+  2  3  2  0  0  0
+  2  8  1  0  0  0
+  3  4  1  0  0  0
+  3  9  1  0  0  0
+  4  5  2  0  0  0
+  4 10  1  0  0  0
+  5  6  1  0  0  0
+  5 11  1  0  0  0
+  6 12  1  0  0  0
+ 12 13  1  0  0  0
+ 12 14  1  0  0  0
+ 12 15  1  0  0  0
+ 13 22  1  0  0  0
+ 13 23  1  0  0  0
+ 13 24  1  0  0  0
+ 14 16  1  0  0  0
+ 14 17  1  0  0  0
+ 14 18  1  0  0  0
+ 15 19  1  0  0  0
+ 15 20  1  0  0  0
+ 15 21  1  0  0  0
+M  END
+> <s_m_entry_id>
+3
+
+> <s_m_entry_name>
+entry
+
+> <r_f3d_energy>
+0
+
+> <i_f3d_flags>
+0
+
+$$$$
+benzene_t_oh
+                    3D
+ Structure written by MMmdl.
+ 18 18  0  0  1  0            999 V2000
+   -0.0059    0.0056   -0.0624 C   0  0  0  0  0  0
+    0.0329   -0.6666   -1.2782 C   0  0  0  0  0  0
+    0.0664   -2.0558   -1.3034 C   0  0  0  0  0  0
+    0.0612   -2.7728   -0.1128 C   0  0  0  0  0  0
+    0.0224   -2.1005    1.1030 C   0  0  0  0  0  0
+   -0.0111   -0.7114    1.1282 C   0  0  0  0  0  0
+   -0.0320    1.0872   -0.0428 H   0  0  0  0  0  0
+    0.0370   -0.1084   -2.2052 H   0  0  0  0  0  0
+    0.0966   -2.5792   -2.2500 H   0  0  0  0  0  0
+    0.0873   -3.8543   -0.1324 H   0  0  0  0  0  0
+    0.0183   -2.6587    2.0300 H   0  0  0  0  0  0
+   -0.0541    0.0335    2.4754 C   0  0  0  0  0  0
+   -0.0516   -0.9017    3.5306 O   0  0  0  0  0  0
+    1.0755    0.8700    2.5870 O   0  0  0  0  0  0
+   -1.2256    0.8152    2.5439 O   0  0  0  0  0  0
+   -1.2524    1.2796    3.3837 H   0  0  0  0  0  0
+    1.0487    1.3343    3.4268 H   0  0  0  0  0  0
+   -0.0784   -0.4374    4.3704 H   0  0  0  0  0  0
+  1  2  1  0  0  0
+  1  6  2  0  0  0
+  1  7  1  0  0  0
+  2  3  2  0  0  0
+  2  8  1  0  0  0
+  3  4  1  0  0  0
+  3  9  1  0  0  0
+  4  5  2  0  0  0
+  4 10  1  0  0  0
+  5  6  1  0  0  0
+  5 11  1  0  0  0
+  6 12  1  0  0  0
+ 12 13  1  0  0  0
+ 12 14  1  0  0  0
+ 12 15  1  0  0  0
+ 13 18  1  0  0  0
+ 14 17  1  0  0  0
+ 15 16  1  0  0  0
+M  END
+> <s_m_entry_id>
+4
+
+> <s_m_entry_name>
+entry
+
+> <r_f3d_energy>
+0
+
+> <i_f3d_flags>
+0
+
+$$$$
+benzene_bicycle
+                    3D
+ Structure written by MMmdl.
+ 24 26  0  0  1  0            999 V2000
+   -0.0059    0.0055   -0.0625 C   0  0  0  0  0  0
+    0.0329   -0.6665   -1.2782 C   0  0  0  0  0  0
+    0.0664   -2.0560   -1.3034 C   0  0  0  0  0  0
+    0.0612   -2.7727   -0.1128 C   0  0  0  0  0  0
+    0.0224   -2.1008    1.1028 C   0  0  0  0  0  0
+   -0.0114   -0.7056    1.1357 C   0  0  0  0  0  0
+   -0.0320    1.0871   -0.0428 H   0  0  0  0  0  0
+    0.0370   -0.1085   -2.2050 H   0  0  0  0  0  0
+    0.0966   -2.5791   -2.2500 H   0  0  0  0  0  0
+    0.0873   -3.8540   -0.1325 H   0  0  0  0  0  0
+    0.0183   -2.6587    2.0299 H   0  0  0  0  0  0
+   -0.0527   -0.0266    2.4813 C   0  0  0  0  0  0
+   -0.0587   -0.6834    3.6300 N   0  0  0  0  0  0
+   -0.0982    0.2636    4.5389 C   0  0  0  0  0  0
+   -0.1176    1.5222    3.9853 C   0  0  0  0  0  0
+   -0.0879    1.3408    2.6283 O   0  0  0  0  0  0
+   -0.1263    0.2547    6.0480 C   0  0  0  0  0  0
+   -0.1631    2.6102    5.0358 C   0  0  0  0  0  0
+   -0.1684    1.7629    6.3928 C   0  0  0  0  0  0
+   -1.0125   -0.2801    6.4170 H   0  0  0  0  0  0
+    0.7698   -0.2379    6.4504 H   0  0  0  0  0  0
+   -0.2062    2.4033    7.5965 O   0  0  0  0  0  0
+   -0.1821    3.6481    4.7378 H   0  0  0  0  0  0
+   -0.2260    3.3525    7.4541 H   0  0  0  0  0  0
+  1  2  1  0  0  0
+  1  6  2  0  0  0
+  1  7  1  0  0  0
+  2  3  2  0  0  0
+  2  8  1  0  0  0
+  3  4  1  0  0  0
+  3  9  1  0  0  0
+  4  5  2  0  0  0
+  4 10  1  0  0  0
+  5  6  1  0  0  0
+  5 11  1  0  0  0
+  6 12  1  0  0  0
+ 12 13  2  0  0  0
+ 12 16  1  0  0  0
+ 13 14  1  0  0  0
+ 14 15  2  0  0  0
+ 14 17  1  0  0  0
+ 15 16  1  0  0  0
+ 15 18  1  0  0  0
+ 17 19  1  0  0  0
+ 17 20  1  0  0  0
+ 17 21  1  0  0  0
+ 18 19  2  0  0  0
+ 18 23  1  0  0  0
+ 19 22  1  0  0  0
+ 22 24  1  0  0  0
+M  END
+> <s_m_entry_id>
+5
+
+> <s_m_entry_name>
+entry
+
+> <r_f3d_energy>
+0
+
+> <i_f3d_flags>
+0
+
+$$$$