Skip to content

Commit

Permalink
feat: move hashing logic to generate_identifier function in Dataset class
Browse files Browse the repository at this point in the history
  • Loading branch information
vivienho committed Sep 26, 2024
1 parent ce125f9 commit f1b0817
Show file tree
Hide file tree
Showing 11 changed files with 42 additions and 35 deletions.
15 changes: 15 additions & 0 deletions src/gentropy/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,3 +352,18 @@ def flag_duplicates(test_column: Column) -> Column:
)
> 1
)

@staticmethod
def generate_identifier(uniqueness_defining_columns: list[str]) -> Column:
    """Hashes the provided columns to generate a unique identifier.

    Each column is cast to string, with nulls replaced by the literal
    "None" so that null values still contribute to the hash. The
    stringified columns are concatenated and MD5-hashed.

    Args:
        uniqueness_defining_columns (list[str]): list of columns defining uniqueness

    Returns:
        Column: column with a unique identifier
    """
    # coalesce(cast, lit("None")) is equivalent to the explicit
    # when(isNull, "None").otherwise(cast) null check.
    stringified_columns = [
        f.coalesce(f.col(column_name).cast("string"), f.lit("None"))
        for column_name in uniqueness_defining_columns
    ]
    return f.md5(f.concat(*stringified_columns))
25 changes: 6 additions & 19 deletions src/gentropy/dataset/study_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,24 +447,18 @@ def _align_overlapping_tags(
)

@staticmethod
def assign_study_locus_id(
study_id_col: Column,
variant_id_col: Column,
finemapping_col: Column = None,
) -> Column:
"""Hashes a column with a variant ID and a study ID to extract a consistent studyLocusId.
def assign_study_locus_id(uniqueness_defining_columns: list[str]) -> Column:
"""Hashes the provided columns to extract a consistent studyLocusId.
Args:
study_id_col (Column): column name with a study ID
variant_id_col (Column): column name with a variant ID
finemapping_col (Column, optional): column with fine mapping methodology
uniqueness_defining_columns (list[str]): list of columns defining uniqueness
Returns:
Column: column with a study locus ID
Examples:
>>> df = spark.createDataFrame([("GCST000001", "1_1000_A_C", "SuSiE-inf"), ("GCST000002", "1_1000_A_C", "pics")]).toDF("studyId", "variantId", "finemappingMethod")
>>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId"), f.col("finemappingMethod"))).show(truncate=False)
>>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(["studyId", "variantId", "finemappingMethod"])).show(truncate=False)
+----------+----------+-----------------+--------------------------------+
|studyId |variantId |finemappingMethod|study_locus_id |
+----------+----------+-----------------+--------------------------------+
Expand All @@ -473,15 +467,8 @@ def assign_study_locus_id(
+----------+----------+-----------------+--------------------------------+
<BLANKLINE>
"""
if finemapping_col is None:
finemapping_col = f.lit("None")
columns = [study_id_col, variant_id_col, finemapping_col]
hashable_columns = [f.when(column.cast("string").isNull(), f.lit("None"))
.otherwise(column.cast("string"))
for column in columns]
return f.md5(f.concat(*hashable_columns)).alias(
"studyLocusId"
)
return Dataset.generate_identifier(uniqueness_defining_columns).alias("studyLocusId")


@classmethod
def calculate_credible_set_log10bf(cls: type[StudyLocus], logbfs: Column) -> Column:
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/datasource/eqtl_catalogue/finemapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ def from_susie_results(
.select(
*study_locus_cols,
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
["studyId", "variantId", "finemappingMethod"]
),
StudyLocus.calculate_credible_set_log10bf(
f.col("locus.logBF")
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/datasource/finngen/finemapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ def from_finngen_susie_finemapping(
).withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
["studyId", "variantId", "finemappingMethod"]
),
)

Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/datasource/gwas_catalog/associations.py
Original file line number Diff line number Diff line change
Expand Up @@ -1188,7 +1188,7 @@ def update_study_id(
.drop("subStudyDescription", "updatedStudyId")
).withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
StudyLocus.assign_study_locus_id(["studyId", "variantId"]),
)
return self

Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/datasource/open_targets/l2g_gold_standard.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def parse_positive_curation(
)
.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
StudyLocus.assign_study_locus_id(["studyId", "variantId"]),
)
.groupBy("studyLocusId", "studyId", "variantId", "geneId")
.agg(f.collect_set("source").alias("sources"))
Expand Down
17 changes: 11 additions & 6 deletions src/gentropy/l2g.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,17 +207,22 @@ def _generate_feature_matrix(self, write_feature_matrix: bool) -> L2GFeatureMatr
study_locus_overlap = StudyLocus(
_df=self.credible_set.df.join(
f.broadcast(
self.gs_curation.select(
StudyLocus.assign_study_locus_id(
f.col("association_info.otg_id"), # studyId
f.concat_ws( # variantId
self.gs_curation
.withColumn(
"variantId",
f.concat_ws(
"_",
f.col("sentinel_variant.locus_GRCh38.chromosome"),
f.col("sentinel_variant.locus_GRCh38.position"),
f.col("sentinel_variant.alleles.reference"),
f.col("sentinel_variant.alleles.alternative"),
),
).alias("studyLocusId"),
)
)
.select(
StudyLocus.assign_study_locus_id(
["association_info.otg_id", # studyId
"variantId"]
),
)
),
"studyLocusId",
Expand Down
4 changes: 2 additions & 2 deletions src/gentropy/method/locus_breaker_clumping.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,8 @@ def locus_breaker(
.cast(t.ArrayType(t.StringType()))
.alias("qualityControls"),
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId")
).alias("studyLocusId"),
["studyId", "variantId"]
),
)
),
_schema=StudyLocus.get_schema(),
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/method/pics.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def finemap(
.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
["studyId", "variantId", "finemappingMethod"]
),
)
.drop("neglog_pvalue")
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/method/window_based_clumping.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ def clump(
.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId")
["studyId", "variantId"]
),
)
# Initialize QC column as array of strings:
Expand Down
4 changes: 2 additions & 2 deletions src/gentropy/susie_finemapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def __init__(
.df.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
["studyId", "variantId", "finemappingMethod"]
),
)
.collect()[0]
Expand Down Expand Up @@ -247,7 +247,7 @@ def susie_inf_to_studylocus(
.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
["studyId", "variantId", "finemappingMethod"]
),
)
.select(
Expand Down

0 comments on commit f1b0817

Please sign in to comment.