Skip to content

Commit

Permalink
Merge branch 'dev' into do_out_sample_qc
Browse files Browse the repository at this point in the history
  • Loading branch information
DSuveges authored Sep 30, 2024
2 parents ad452b4 + 5c58e58 commit 4e17a14
Show file tree
Hide file tree
Showing 28 changed files with 177 additions and 183 deletions.
4 changes: 2 additions & 2 deletions src/gentropy/assets/schemas/colocalisation.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
{
"name": "leftStudyLocusId",
"nullable": false,
"type": "long",
"type": "string",
"metadata": {}
},
{
"name": "rightStudyLocusId",
"nullable": false,
"type": "long",
"type": "string",
"metadata": {}
},
{
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/assets/schemas/l2g_feature.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"fields": [
{
"name": "studyLocusId",
"type": "long",
"type": "string",
"nullable": false,
"metadata": {}
},
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/assets/schemas/l2g_gold_standard.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"fields": [
{
"name": "studyLocusId",
"type": "long",
"type": "string",
"nullable": false,
"metadata": {}
},
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/assets/schemas/l2g_predictions.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"fields": [
{
"name": "studyLocusId",
"type": "long",
"type": "string",
"nullable": false,
"metadata": {}
},
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/assets/schemas/study_locus.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"metadata": {},
"name": "studyLocusId",
"nullable": false,
"type": "long"
"type": "string"
},
{
"metadata": {},
Expand Down
4 changes: 2 additions & 2 deletions src/gentropy/assets/schemas/study_locus_overlap.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
"metadata": {},
"name": "leftStudyLocusId",
"nullable": false,
"type": "long"
"type": "string"
},
{
"metadata": {},
"name": "rightStudyLocusId",
"nullable": false,
"type": "long"
"type": "string"
},
{
"metadata": {},
Expand Down
15 changes: 15 additions & 0 deletions src/gentropy/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,3 +310,18 @@ def flag_duplicates(test_column: Column) -> Column:
)
> 1
)

@staticmethod
def generate_identifier(uniqueness_defining_columns: list[str]) -> Column:
    """Hash the provided columns to generate a deterministic unique identifier.

    Null values are substituted with the literal string "None" before hashing,
    because Spark's `concat` returns null as soon as any input column is null —
    without the guard, any row with a null key column would get a null ID.

    Args:
        uniqueness_defining_columns (list[str]): list of columns defining uniqueness

    Returns:
        Column: column with a unique identifier (32-character hex MD5 digest)
    """
    # coalesce(cast, "None") is the idiomatic single-expression form of
    # when(col.isNull(), "None").otherwise(col) — identical semantics.
    hashable_columns = [
        f.coalesce(f.col(column).cast("string"), f.lit("None"))
        for column in uniqueness_defining_columns
    ]
    return f.md5(f.concat(*hashable_columns))
34 changes: 12 additions & 22 deletions src/gentropy/dataset/study_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,38 +448,28 @@ def _align_overlapping_tags(
)

@staticmethod
def assign_study_locus_id(
study_id_col: Column,
variant_id_col: Column,
finemapping_col: Column = None,
) -> Column:
"""Hashes a column with a variant ID and a study ID to extract a consistent studyLocusId.
def assign_study_locus_id(uniqueness_defining_columns: list[str]) -> Column:
"""Hashes the provided columns to extract a consistent studyLocusId.
Args:
study_id_col (Column): column name with a study ID
variant_id_col (Column): column name with a variant ID
finemapping_col (Column, optional): column with fine mapping methodology
uniqueness_defining_columns (list[str]): list of columns defining uniqueness
Returns:
Column: column with a study locus ID
Examples:
>>> df = spark.createDataFrame([("GCST000001", "1_1000_A_C", "SuSiE-inf"), ("GCST000002", "1_1000_A_C", "pics")]).toDF("studyId", "variantId", "finemappingMethod")
>>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId"), f.col("finemappingMethod"))).show()
+----------+----------+-----------------+-------------------+
| studyId| variantId|finemappingMethod| study_locus_id|
+----------+----------+-----------------+-------------------+
|GCST000001|1_1000_A_C| SuSiE-inf|3801266831619496075|
|GCST000002|1_1000_A_C| pics|1581844826999194430|
+----------+----------+-----------------+-------------------+
>>> df.withColumn("study_locus_id", StudyLocus.assign_study_locus_id(["studyId", "variantId", "finemappingMethod"])).show(truncate=False)
+----------+----------+-----------------+--------------------------------+
|studyId |variantId |finemappingMethod|study_locus_id |
+----------+----------+-----------------+--------------------------------+
|GCST000001|1_1000_A_C|SuSiE-inf |109804fe1e20c94231a31bafd71b566e|
|GCST000002|1_1000_A_C|pics |de310be4558e0482c9cc359c97d37773|
+----------+----------+-----------------+--------------------------------+
<BLANKLINE>
"""
if finemapping_col is None:
finemapping_col = f.lit(None).cast(StringType())
variant_id_col = f.coalesce(variant_id_col, f.rand().cast("string"))
return f.xxhash64(study_id_col, variant_id_col, finemapping_col).alias(
"studyLocusId"
)
return Dataset.generate_identifier(uniqueness_defining_columns).alias("studyLocusId")


@classmethod
def calculate_credible_set_log10bf(cls: type[StudyLocus], logbfs: Column) -> Column:
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/datasource/eqtl_catalogue/finemapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ def from_susie_results(
.select(
*study_locus_cols,
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
["studyId", "variantId", "finemappingMethod"]
),
StudyLocus.calculate_credible_set_log10bf(
f.col("locus.logBF")
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/datasource/finngen/finemapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ def from_finngen_susie_finemapping(
).withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
["studyId", "variantId", "finemappingMethod"]
),
)

Expand Down
6 changes: 3 additions & 3 deletions src/gentropy/datasource/gwas_catalog/associations.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from typing import TYPE_CHECKING

import pyspark.sql.functions as f
from pyspark.sql.types import DoubleType, FloatType, IntegerType, LongType
from pyspark.sql.types import DoubleType, FloatType, IntegerType, StringType
from pyspark.sql.window import Window

from gentropy.assets import data
Expand Down Expand Up @@ -1109,7 +1109,7 @@ def from_source(
"""
return StudyLocusGWASCatalog(
_df=gwas_associations.withColumn(
"studyLocusId", f.monotonically_increasing_id().cast(LongType())
"studyLocusId", f.monotonically_increasing_id().cast(StringType())
)
.transform(
# Map/harmonise variants to variant annotation dataset:
Expand Down Expand Up @@ -1188,7 +1188,7 @@ def update_study_id(
.drop("subStudyDescription", "updatedStudyId")
).withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
StudyLocus.assign_study_locus_id(["studyId", "variantId"]),
)
return self

Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/datasource/open_targets/l2g_gold_standard.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def parse_positive_curation(
)
.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
StudyLocus.assign_study_locus_id(["studyId", "variantId"]),
)
.groupBy("studyLocusId", "studyId", "variantId", "geneId")
.agg(f.collect_set("source").alias("sources"))
Expand Down
17 changes: 11 additions & 6 deletions src/gentropy/l2g.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,17 +207,22 @@ def _generate_feature_matrix(self, write_feature_matrix: bool) -> L2GFeatureMatr
study_locus_overlap = StudyLocus(
_df=self.credible_set.df.join(
f.broadcast(
self.gs_curation.select(
StudyLocus.assign_study_locus_id(
f.col("association_info.otg_id"), # studyId
f.concat_ws( # variantId
self.gs_curation
.withColumn(
"variantId",
f.concat_ws(
"_",
f.col("sentinel_variant.locus_GRCh38.chromosome"),
f.col("sentinel_variant.locus_GRCh38.position"),
f.col("sentinel_variant.alleles.reference"),
f.col("sentinel_variant.alleles.alternative"),
),
).alias("studyLocusId"),
)
)
.select(
StudyLocus.assign_study_locus_id(
["association_info.otg_id", # studyId
"variantId"]
),
)
),
"studyLocusId",
Expand Down
4 changes: 2 additions & 2 deletions src/gentropy/method/locus_breaker_clumping.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,8 @@ def locus_breaker(
.cast(t.ArrayType(t.StringType()))
.alias("qualityControls"),
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId")
).alias("studyLocusId"),
["studyId", "variantId"]
),
)
),
_schema=StudyLocus.get_schema(),
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/method/pics.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def finemap(
.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
"studyId", "variantId", "finemappingMethod"
["studyId", "variantId", "finemappingMethod"]
),
)
.drop("neglog_pvalue")
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/method/window_based_clumping.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ def clump(
.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId")
["studyId", "variantId"]
),
)
# Initialize QC column as array of strings:
Expand Down
4 changes: 2 additions & 2 deletions src/gentropy/susie_finemapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def __init__(
.df.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
"studyId", "variantId", "finemappingMethod"
["studyId", "variantId", "finemappingMethod"]
),
)
.collect()[0]
Expand Down Expand Up @@ -247,7 +247,7 @@ def susie_inf_to_studylocus(
.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(
f.col("studyId"), f.col("variantId"), f.col("finemappingMethod")
["studyId", "variantId", "finemappingMethod"]
),
)
.select(
Expand Down
6 changes: 3 additions & 3 deletions tests/gentropy/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,10 +617,10 @@ def mock_l2g_feature_matrix(spark: SparkSession) -> L2GFeatureMatrix:
return L2GFeatureMatrix(
_df=spark.createDataFrame(
[
(1, "gene1", 100.0, None),
(2, "gene2", 1000.0, 0.0),
("1", "gene1", 100.0, None),
("2", "gene2", 1000.0, 0.0),
],
"studyLocusId LONG, geneId STRING, distanceTssMean FLOAT, distanceTssMinimum FLOAT",
"studyLocusId STRING, geneId STRING, distanceTssMean FLOAT, distanceTssMinimum FLOAT",
),
with_gold_standard=False,
)
Expand Down
6 changes: 3 additions & 3 deletions tests/gentropy/dataset/test_colocalisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,12 @@ def _setup(self: TestAppendStudyMetadata, spark: SparkSession) -> None:
_df=spark.createDataFrame(
[
(
1,
"1",
"var1",
"gwas1",
),
(
2,
"2",
"var2",
"eqtl1",
),
Expand All @@ -100,7 +100,7 @@ def _setup(self: TestAppendStudyMetadata, spark: SparkSession) -> None:
)
self.sample_colocalisation = Colocalisation(
_df=spark.createDataFrame(
[(1, 2, "eqtl", "X", "COLOC", 1, 0.9)],
[("1", "2", "eqtl", "X", "COLOC", 1, 0.9)],
[
"leftStudyLocusId",
"rightStudyLocusId",
Expand Down
Loading

0 comments on commit 4e17a14

Please sign in to comment.