Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: change StudyLocusId hashing method to md5 (and change StudyLocusId to string type) #783

Merged
merged 19 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
4389315
feat: change studyLocusId to string in schema
vivienho Sep 24, 2024
7dc153a
feat: change studyLocusId of example data to string in tests
vivienho Sep 24, 2024
7e62efd
feat: change hashing method to md5
vivienho Sep 24, 2024
dd354b4
test: remove test_assign_study_locus_id__null_variant_id as validatio…
vivienho Sep 24, 2024
4c7e146
fix: change studyLocusId to string in remaining files
vivienho Sep 24, 2024
58ca683
chore: resolve merge conflicts
vivienho Sep 24, 2024
1dec962
fix: ensure inputs to assign_study_locus_id are columns and not strings
vivienho Sep 24, 2024
bcae23d
fix: change studyLocusId to string in remaining files
vivienho Sep 24, 2024
d3c122d
Merge branch 'dev' into vh-3448
vivienho Sep 24, 2024
8057a55
chore: update assign_study_locus_id docstring with updated output
vivienho Sep 24, 2024
78ab0c0
Merge branch 'vh-3448' of https://github.com/opentargets/gentropy int…
vivienho Sep 24, 2024
d8ab719
chore: update assign_study_locus_id docstring with updated output (ag…
vivienho Sep 24, 2024
29ef81a
Merge branch 'dev' into vh-3448
vivienho Sep 24, 2024
e873353
fix: change studyLocusId to string in recently merged files
vivienho Sep 24, 2024
ce125f9
Merge branch 'dev' into vh-3448
DSuveges Sep 25, 2024
f1b0817
feat: move hashing logic to generate_identifier function in Dataset c…
vivienho Sep 26, 2024
c441b79
Merge branch 'dev' into vh-3448
DSuveges Sep 26, 2024
caea96e
Merge branch 'dev' into vh-3448
DSuveges Sep 27, 2024
bd0ed41
Merge branch 'dev' into vh-3448
DSuveges Sep 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/gentropy/assets/schemas/colocalisation.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
{
"name": "leftStudyLocusId",
"nullable": false,
"type": "long",
"type": "string",
"metadata": {}
},
{
"name": "rightStudyLocusId",
"nullable": false,
"type": "long",
"type": "string",
"metadata": {}
},
{
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/assets/schemas/l2g_feature.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"fields": [
{
"name": "studyLocusId",
"type": "long",
"type": "string",
"nullable": false,
"metadata": {}
},
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/assets/schemas/l2g_gold_standard.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"fields": [
{
"name": "studyLocusId",
"type": "long",
"type": "string",
"nullable": false,
"metadata": {}
},
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/assets/schemas/l2g_predictions.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"fields": [
{
"name": "studyLocusId",
"type": "long",
"type": "string",
"nullable": false,
"metadata": {}
},
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/assets/schemas/study_locus.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"metadata": {},
"name": "studyLocusId",
"nullable": false,
"type": "long"
"type": "string"
},
{
"metadata": {},
Expand Down
4 changes: 2 additions & 2 deletions src/gentropy/assets/schemas/study_locus_overlap.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
"metadata": {},
"name": "leftStudyLocusId",
"nullable": false,
"type": "long"
"type": "string"
},
{
"metadata": {},
"name": "rightStudyLocusId",
"nullable": false,
"type": "long"
"type": "string"
},
{
"metadata": {},
Expand Down
15 changes: 15 additions & 0 deletions src/gentropy/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,3 +310,18 @@ def flag_duplicates(test_column: Column) -> Column:
)
> 1
)

@staticmethod
def generate_identifier(uniqueness_defining_columns: list[str]) -> Column:
    """Hashes the provided columns to generate a unique identifier.

    Each column is cast to string; NULLs are replaced with the literal
    string "None" so that rows containing NULLs still hash deterministically.
    The per-column strings are concatenated and the md5 digest of the result
    (a 32-character lowercase hex string) is returned.

    Args:
        uniqueness_defining_columns (list[str]): list of columns defining uniqueness

    Returns:
        Column: column with a unique identifier

    # NOTE(review): f.concat joins with NO separator, so ("ab", "c") and
    # ("a", "bc") hash to the same identifier — confirm the uniqueness-defining
    # columns can never collide this way, or consider concat_ws with a delimiter.
    # NOTE(review): a real value equal to the string "None" is indistinguishable
    # from a NULL under this scheme — presumably acceptable for these columns;
    # verify against callers.
    """
    # Null-safe stringification of every uniqueness-defining column.
    hashable_columns = [f.when(f.col(column).cast("string").isNull(), f.lit("None"))
                        .otherwise(f.col(column).cast("string"))
                        for column in uniqueness_defining_columns]
    # Stable 32-hex-char md5 over the concatenated column strings.
    return f.md5(f.concat(*hashable_columns))
34 changes: 12 additions & 22 deletions src/gentropy/dataset/study_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,38 +447,28 @@ def _align_overlapping_tags(
)

@staticmethod
def assign_study_locus_id(uniqueness_defining_columns: list[str]) -> Column:
    """Hashes the provided columns to extract a consistent studyLocusId.

    Thin wrapper over Dataset.generate_identifier that aliases the resulting
    md5-based identifier column as "studyLocusId" (a 32-character hex string).

    Args:
        uniqueness_defining_columns (list[str]): list of columns defining uniqueness

    Returns:
        Column: column with a study locus ID

    Examples:
        >>> df = spark.createDataFrame([("GCST000001", "1_1000_A_C", "SuSiE-inf"), ("GCST000002", "1_1000_A_C", "pics")]).toDF("studyId", "variantId", "finemappingMethod")
        >>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(["studyId", "variantId", "finemappingMethod"])).show(truncate=False)
        +----------+----------+-----------------+--------------------------------+
        |studyId   |variantId |finemappingMethod|study_locus_id                  |
        +----------+----------+-----------------+--------------------------------+
        |GCST000001|1_1000_A_C|SuSiE-inf        |109804fe1e20c94231a31bafd71b566e|
        |GCST000002|1_1000_A_C|pics             |de310be4558e0482c9cc359c97d37773|
        +----------+----------+-----------------+--------------------------------+
        <BLANKLINE>
    """
    # Delegate hashing to the shared Dataset helper so all identifiers in the
    # codebase are generated the same way.
    return Dataset.generate_identifier(uniqueness_defining_columns).alias("studyLocusId")


@classmethod
def calculate_credible_set_log10bf(cls: type[StudyLocus], logbfs: Column) -> Column:
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/datasource/eqtl_catalogue/finemapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ def from_susie_results(
.select(
*study_locus_cols,
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
["studyId", "variantId", "finemappingMethod"]
),
StudyLocus.calculate_credible_set_log10bf(
f.col("locus.logBF")
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/datasource/finngen/finemapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ def from_finngen_susie_finemapping(
).withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
["studyId", "variantId", "finemappingMethod"]
),
)

Expand Down
6 changes: 3 additions & 3 deletions src/gentropy/datasource/gwas_catalog/associations.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from typing import TYPE_CHECKING

import pyspark.sql.functions as f
from pyspark.sql.types import DoubleType, FloatType, IntegerType, LongType
from pyspark.sql.types import DoubleType, FloatType, IntegerType, StringType
from pyspark.sql.window import Window

from gentropy.assets import data
Expand Down Expand Up @@ -1109,7 +1109,7 @@ def from_source(
"""
return StudyLocusGWASCatalog(
_df=gwas_associations.withColumn(
"studyLocusId", f.monotonically_increasing_id().cast(LongType())
"studyLocusId", f.monotonically_increasing_id().cast(StringType())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not a dealbreaker, and has no impact whatsoever: this column is not a "real" studyLocusId: this column is temporarily generated to identify original rows of the GWAS Catalog association dataset before explosion. But it is fine.

)
.transform(
# Map/harmonise variants to variant annotation dataset:
Expand Down Expand Up @@ -1188,7 +1188,7 @@ def update_study_id(
.drop("subStudyDescription", "updatedStudyId")
).withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
StudyLocus.assign_study_locus_id(["studyId", "variantId"]),
)
return self

Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/datasource/open_targets/l2g_gold_standard.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def parse_positive_curation(
)
.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
StudyLocus.assign_study_locus_id(["studyId", "variantId"]),
)
.groupBy("studyLocusId", "studyId", "variantId", "geneId")
.agg(f.collect_set("source").alias("sources"))
Expand Down
17 changes: 11 additions & 6 deletions src/gentropy/l2g.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,17 +207,22 @@ def _generate_feature_matrix(self, write_feature_matrix: bool) -> L2GFeatureMatr
study_locus_overlap = StudyLocus(
_df=self.credible_set.df.join(
f.broadcast(
self.gs_curation.select(
StudyLocus.assign_study_locus_id(
f.col("association_info.otg_id"), # studyId
f.concat_ws( # variantId
self.gs_curation
.withColumn(
"variantId",
f.concat_ws(
"_",
f.col("sentinel_variant.locus_GRCh38.chromosome"),
f.col("sentinel_variant.locus_GRCh38.position"),
f.col("sentinel_variant.alleles.reference"),
f.col("sentinel_variant.alleles.alternative"),
),
).alias("studyLocusId"),
)
)
.select(
StudyLocus.assign_study_locus_id(
["association_info.otg_id", # studyId
"variantId"]
),
)
),
"studyLocusId",
Expand Down
4 changes: 2 additions & 2 deletions src/gentropy/method/locus_breaker_clumping.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,8 @@ def locus_breaker(
.cast(t.ArrayType(t.StringType()))
.alias("qualityControls"),
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId")
).alias("studyLocusId"),
["studyId", "variantId"]
),
)
),
_schema=StudyLocus.get_schema(),
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/method/pics.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def finemap(
.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
"studyId", "variantId", "finemappingMethod"
["studyId", "variantId", "finemappingMethod"]
),
)
.drop("neglog_pvalue")
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/method/window_based_clumping.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ def clump(
.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId")
["studyId", "variantId"]
),
)
# Initialize QC column as array of strings:
Expand Down
4 changes: 2 additions & 2 deletions src/gentropy/susie_finemapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def __init__(
.df.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
"studyId", "variantId", "finemappingMethod"
["studyId", "variantId", "finemappingMethod"]
),
)
.collect()[0]
Expand Down Expand Up @@ -247,7 +247,7 @@ def susie_inf_to_studylocus(
.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
["studyId", "variantId", "finemappingMethod"]
),
)
.select(
Expand Down
6 changes: 3 additions & 3 deletions tests/gentropy/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,10 +617,10 @@ def mock_l2g_feature_matrix(spark: SparkSession) -> L2GFeatureMatrix:
return L2GFeatureMatrix(
_df=spark.createDataFrame(
[
(1, "gene1", 100.0, None),
(2, "gene2", 1000.0, 0.0),
("1", "gene1", 100.0, None),
("2", "gene2", 1000.0, 0.0),
],
"studyLocusId LONG, geneId STRING, distanceTssMean FLOAT, distanceTssMinimum FLOAT",
"studyLocusId STRING, geneId STRING, distanceTssMean FLOAT, distanceTssMinimum FLOAT",
),
with_gold_standard=False,
)
Expand Down
6 changes: 3 additions & 3 deletions tests/gentropy/dataset/test_colocalisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,12 @@ def _setup(self: TestAppendStudyMetadata, spark: SparkSession) -> None:
_df=spark.createDataFrame(
[
(
1,
"1",
"var1",
"gwas1",
),
(
2,
"2",
"var2",
"eqtl1",
),
Expand All @@ -100,7 +100,7 @@ def _setup(self: TestAppendStudyMetadata, spark: SparkSession) -> None:
)
self.sample_colocalisation = Colocalisation(
_df=spark.createDataFrame(
[(1, 2, "eqtl", "X", "COLOC", 1, 0.9)],
[("1", "2", "eqtl", "X", "COLOC", 1, 0.9)],
[
"leftStudyLocusId",
"rightStudyLocusId",
Expand Down
Loading
Loading