Skip to content

Commit 7dfce61

Browse files
chore: review study locus and study index configs (#326)
* chore: make study locus and study index configs clearer
* chore: temporarily turn off removal of redundancies due to performance
* refactor: read study index and study locus recursively
1 parent 923684c commit 7dfce61

File tree

12 files changed

+39
-26
lines changed

12 files changed

+39
-26
lines changed

config/datasets/gcp.yaml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -30,16 +30,16 @@ eqtl_catalogue_paths_imported: ${datasets.inputs}/preprocess/eqtl_catalogue/tabi
3030
gene_index: ${datasets.outputs}/gene_index
3131
variant_annotation: ${datasets.outputs}/variant_annotation
3232
variant_index: ${datasets.outputs}/variant_index
33+
study_index: ${datasets.outputs}/study_index
3334
study_locus: ${datasets.outputs}/study_locus
3435
credible_set: ${datasets.outputs}/credible_set
35-
study_index: ${datasets.outputs}/study_index
3636
summary_statistics: ${datasets.outputs}/summary_statistics
3737
study_locus_overlap: ${datasets.outputs}/study_locus_overlap
3838
colocalisation: ${datasets.outputs}/colocalisation
3939
v2g: ${datasets.outputs}/v2g
4040
ld_index: ${datasets.outputs}/ld_index
41-
catalog_study_index: ${datasets.study_index}/catalog_curated
42-
catalog_study_locus: ${datasets.credible_set}/catalog_curated
41+
catalog_study_index: ${datasets.study_index}/catalog
42+
catalog_study_locus: ${datasets.study_locus}/catalog_study_locus
4343
finngen_study_index: ${datasets.study_index}/finngen
4444
finngen_summary_stats: ${datasets.summary_statistics}/finngen
4545
from_sumstats_study_locus: ${datasets.study_locus}/from_sumstats

config/step/locus_to_gene.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ predictions_path: ${datasets.l2g_predictions}
1111
credible_set_path: ${datasets.credible_set}
1212
variant_gene_path: ${datasets.v2g}
1313
colocalisation_path: ${datasets.colocalisation}
14-
study_index_path: ${datasets.catalog_study_index}
14+
study_index_path: ${datasets.study_index}
1515
study_locus_overlap_path: ${datasets.study_locus_overlap}
1616
gold_standard_curation_path: ${datasets.l2g_gold_standard_curation}
1717
gene_interactions_path: ${datasets.gene_interactions}

config/step/pics.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
11
_target_: otg.pics.PICSStep
2-
study_locus_ld_annotated_in: ${datasets.from_sumstats_study_locus}
3-
picsed_study_locus_out: ${datasets.from_sumstats_pics}
2+
study_locus_ld_annotated_in: ???
3+
picsed_study_locus_out: ???

config/step/study_locus_overlap.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
_target_: otg.overlaps.OverlapsIndexStep
2-
study_locus_path: ${datasets.outputs}/catalog_study_locus
3-
study_index_path: ${datasets.outputs}/catalog_study_index
2+
study_locus_path: ${datasets.outputs}/credible_set
3+
study_index_path: ${datasets.outputs}/study_index
44
overlaps_index_out: ${datasets.outputs}/study_locus_overlap

config/step/variant_index.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
_target_: otg.variant_index.VariantIndexStep
22
variant_annotation_path: ${datasets.variant_annotation}
3-
study_locus_path: ${datasets.study_locus}
3+
credible_set_path: ${datasets.study_locus}
44
variant_index_path: ${datasets.variant_index}

src/airflow/dags/configs/dag.yaml

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -7,15 +7,12 @@
77
prerequisites:
88
- "variant_index"
99
- "gene_index"
10-
- id: "ukbiobank"
1110
- id: "study_locus_overlap"
1211
prerequisites:
1312
- "gwas_catalog"
14-
- "ukbiobank"
1513
- id: "locus_to_gene"
1614
prerequisites:
1715
- "gwas_catalog"
18-
- "ukbiobank"
1916
- "variant_index"
2017
- "v2g"
2118
- "study_locus_overlap"

src/airflow/dags/dag_preprocess.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,13 @@
88

99
CLUSTER_NAME = "otg-preprocess"
1010

11-
ALL_STEPS = ["eqtl_catalogue", "ld_index", "variant_annotation"]
11+
ALL_STEPS = [
12+
"finngen",
13+
"eqtl_catalogue",
14+
"ld_index",
15+
"variant_annotation",
16+
"ukbiobank",
17+
]
1218

1319

1420
with DAG(

src/otg/dataset/l2g_gold_standard.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,7 @@ def from_otg_curation(
5454

5555
return (
5656
OpenTargetsL2GGoldStandard.as_l2g_gold_standard(gold_standard_curation, v2g)
57-
.filter_unique_associations(study_locus_overlap)
57+
# .filter_unique_associations(study_locus_overlap)
5858
.remove_false_negatives(interactions_df)
5959
)
6060

src/otg/l2g.py

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
from dataclasses import dataclass, field
55
from typing import Any
66

7+
import pyspark.sql.functions as f
78
import sklearn
89
from omegaconf import MISSING
910
from xgboost.spark import SparkXGBClassifier
@@ -114,7 +115,9 @@ def __post_init__(self: LocusToGeneStep) -> None:
114115
credible_set = StudyLocus.from_parquet(
115116
self.session, self.credible_set_path, recursiveFileLookup=True
116117
)
117-
studies = StudyIndex.from_parquet(self.session, self.study_index_path)
118+
studies = StudyIndex.from_parquet(
119+
self.session, self.study_index_path, recursiveFileLookup=True
120+
)
118121
v2g = V2G.from_parquet(self.session, self.variant_gene_path)
119122
# coloc = Colocalisation.from_parquet(self.session, self.colocalisation_path) # TODO: run step
120123

@@ -142,8 +145,12 @@ def __post_init__(self: LocusToGeneStep) -> None:
142145

143146
# Join and fill null values with 0
144147
data = L2GFeatureMatrix(
145-
_df=gold_standards.df.drop("sources").join(
146-
fm.df, on=["studyLocusId", "geneId"], how="inner"
148+
_df=fm.df.join(
149+
f.broadcast(
150+
gold_standards.df.drop("variantId", "studyId", "sources")
151+
),
152+
on=["studyLocusId", "geneId"],
153+
how="inner",
147154
),
148155
_schema=L2GFeatureMatrix.get_schema(),
149156
).fill_na()
@@ -168,7 +175,7 @@ def __post_init__(self: LocusToGeneStep) -> None:
168175
)
169176
else:
170177
# Train model
171-
model = LocusToGeneTrainer.train(
178+
LocusToGeneTrainer.train(
172179
data=data,
173180
l2g_model=l2g_model,
174181
features_list=list(self.features_list),
@@ -177,7 +184,6 @@ def __post_init__(self: LocusToGeneStep) -> None:
177184
wandb_run_name=self.wandb_run_name,
178185
**self.hyperparameters,
179186
)
180-
model.save(self.model_path)
181187
self.session.logger.info(
182188
f"Finished L2G step. L2G model saved to {self.model_path}"
183189
)

src/otg/method/l2g/model.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -190,7 +190,7 @@ def evaluate(
190190
hyperparameters: dict[str, Any],
191191
wandb_run_name: str | None,
192192
) -> None:
193-
"""Perform evaluation of the model by applying it to a test set and tracking the results with W&B.
193+
"""Perform evaluation of the model predictions for the test set and track the results with W&B.
194194
195195
Args:
196196
results (DataFrame): Dataframe containing the predictions

src/otg/overlaps.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -35,8 +35,12 @@ class OverlapsIndexStep:
3535
def __post_init__(self: OverlapsIndexStep) -> None:
3636
"""Run step."""
3737
# Extract
38-
study_locus = StudyLocus.from_parquet(self.session, self.study_locus_path)
39-
study_index = StudyIndex.from_parquet(self.session, self.study_index_path)
38+
study_locus = StudyLocus.from_parquet(
39+
self.session, self.study_locus_path, recursiveFileLookup=True
40+
)
41+
study_index = StudyIndex.from_parquet(
42+
self.session, self.study_index_path, recursiveFileLookup=True
43+
)
4044
# Transform
4145
overlaps_index = StudyLocusOverlap.from_associations(study_locus, study_index)
4246
# Load

src/otg/variant_index.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -26,19 +26,19 @@ class VariantIndexStep:
2626

2727
session: Session = MISSING
2828
variant_annotation_path: str = MISSING
29-
study_locus_path: str = MISSING
29+
credible_set_path: str = MISSING
3030
variant_index_path: str = MISSING
3131

3232
def __post_init__(self: VariantIndexStep) -> None:
3333
"""Run step."""
3434
# Extract
3535
va = VariantAnnotation.from_parquet(self.session, self.variant_annotation_path)
36-
study_locus = StudyLocus.from_parquet(
37-
self.session, self.study_locus_path, recursiveFileLookup=True
36+
credible_set = StudyLocus.from_parquet(
37+
self.session, self.credible_set_path, recursiveFileLookup=True
3838
)
3939

4040
# Transform
41-
vi = VariantIndex.from_variant_annotation(va, study_locus)
41+
vi = VariantIndex.from_variant_annotation(va, credible_set)
4242

4343
# Load
4444
self.session.logger.info(f"Writing variant index to: {self.variant_index_path}")

0 commit comments

Comments
 (0)