From 4399b1c87d7a571e09a43fc0052e951ea6a1cc02 Mon Sep 17 00:00:00 2001 From: Tim Millar Date: Fri, 26 Sep 2025 20:28:03 +1200 Subject: [PATCH 1/2] Fix high memory usage bug with chunked HK relationship #1333 --- sgkit/stats/pedigree.py | 9 +++++++-- sgkit/tests/test_pedigree.py | 23 +++++++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/sgkit/stats/pedigree.py b/sgkit/stats/pedigree.py index bff889407..3420ae13a 100644 --- a/sgkit/stats/pedigree.py +++ b/sgkit/stats/pedigree.py @@ -1198,10 +1198,15 @@ def pedigree_kinship( # new dataset dims = ["samples_0", "samples_1"] if return_relationship: - relationship = kinship * 2 if method == EST_HAMILTON_KERR: ploidy = tau.sum(axis=-1) - relationship *= np.sqrt(ploidy[None, :] / 2 * ploidy[:, None] / 2) + ploidy_0 = ploidy.rechunk((kinship.chunks[0],)) + ploidy_1 = ploidy.rechunk((kinship.chunks[1],)) + scale = da.sqrt(ploidy_0[:, None] * ploidy_1[None, :]) + assert kinship.chunks == scale.chunks + relationship = kinship * scale + else: + relationship = kinship * 2 arrays = { variables.stat_pedigree_kinship: xr.DataArray(kinship, dims=dims), variables.stat_pedigree_relationship: xr.DataArray(relationship, dims=dims), diff --git a/sgkit/tests/test_pedigree.py b/sgkit/tests/test_pedigree.py index 943db916d..42856b523 100644 --- a/sgkit/tests/test_pedigree.py +++ b/sgkit/tests/test_pedigree.py @@ -560,6 +560,29 @@ def test_pedigree_kinship__Hamilton_Kerr_compress_parent_dimension( np.testing.assert_array_almost_equal(actual, expect) +def test_pedigree_kinship__Hamilton_Kerr_chunked_mem(): + # tests fix for https://github.com/sgkit-dev/sgkit/issues/1333 + # if chunking is not applied to the ploidy matrix correctly + # this will try to allocate > 7TB of RAM (and presumably fail) + ds = xr.Dataset() + ds["parent"] = ["samples", "parents"], np.full((1_000_000, 2), -1, int) + ds["stat_Hamilton_Kerr_tau"] = ["samples", "parents"], np.full( + (1_000_000, 2), 2, np.uint64 + ) + ds["stat_Hamilton_Kerr_lambda"] = ["samples", "parents"], np.full( + (1_000_000, 2), 0.0 + ) + matrix = sg.pedigree_kinship( + ds, + method="Hamilton-Kerr", + chunks=((10, 999990), (20, 999980)), + return_relationship=True, + ).stat_pedigree_relationship + expect = np.eye(10, 20) + actual = matrix[0:10, 0:20].compute() + np.testing.assert_array_almost_equal(actual, expect) + + @pytest.mark.parametrize("method", ["diploid", "Hamilton-Kerr"]) def test_pedigree_kinship__half_founder(method): ds0 = sg.simulate_genotype_call_dataset(n_variant=1, n_sample=6) From b95b1bc042352e85d19ee96f6caa8afa900277ac Mon Sep 17 00:00:00 2001 From: Tim Millar Date: Fri, 26 Sep 2025 20:37:38 +1200 Subject: [PATCH 2/2] Update changelog --- docs/changelog.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index f4da563ee..91b574ec8 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -28,8 +28,12 @@ Deprecations .. Improvements .. ~~~~~~~~~~~~ -.. Bug fixes -.. ~~~~~~~~~ +Bug fixes +~~~~~~~~~ + +- Fixed bug causing high memory usage in :func:`pedigree_kinship` when + evaluating a chunked relationship matrix with the Hamilton-Kerr method. + (:user:`timothymillar`, :pr:`1335`, :issue:`1333`) .. Documentation .. ~~~~~~~~~~~~~