Commit 00fe023

Merge branch 'main' into feat-code-of-conduct
2 parents 95ae5f1 + 47fb71f

15 files changed: +166 -104 lines changed

config/datasets/gcp.yaml

Lines changed: 7 additions & 12 deletions
@@ -32,23 +32,18 @@ variant_annotation: ${datasets.outputs}/variant_annotation
 variant_index: ${datasets.outputs}/variant_index
 study_locus: ${datasets.outputs}/study_locus
 credible_set: ${datasets.outputs}/credible_set
+study_index: ${datasets.outputs}/study_index
+summary_statistics: ${datasets.outputs}/summary_statistics
 study_locus_overlap: ${datasets.outputs}/study_locus_overlap
 colocalisation: ${datasets.outputs}/colocalisation
 v2g: ${datasets.outputs}/v2g
 ld_index: ${datasets.outputs}/ld_index
-catalog_study_index: ${datasets.outputs}/catalog_study_index
-catalog_study_locus: ${datasets.study_locus}/catalog_curated
-finngen_study_index: ${datasets.outputs}/finngen_study_index
-finngen_summary_stats: ${datasets.outputs}/finngen_summary_stats
+catalog_study_index: ${datasets.study_index}/catalog_curated
+catalog_study_locus: ${datasets.credible_set}/catalog_curated
+finngen_study_index: ${datasets.study_index}/finngen
+finngen_summary_stats: ${datasets.summary_statistics}/finngen
 from_sumstats_study_locus: ${datasets.study_locus}/from_sumstats
 from_sumstats_pics: ${datasets.credible_set}/from_sumstats
-ukbiobank_study_index: ${datasets.outputs}/ukbiobank_study_index
+ukbiobank_study_index: ${datasets.study_index}/ukbiobank
 l2g_model: ${datasets.outputs}/l2g_model
 l2g_predictions: ${datasets.outputs}/l2g_predictions
-eqtl_catalogue_study_index_out: ${datasets.outputs}/preprocess/eqtl_catalogue/study_index
-eqtl_catalogue_summary_stats_out: ${datasets.outputs}/preprocess/eqtl_catalogue/summary_stats
-
-# Constants
-finngen_release_prefix: FINNGEN_R9
-finngen_sumstat_url_prefix: gs://finngen-public-data-r9/summary_stats/finngen_R9_
-finngen_sumstat_url_suffix: .gz

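The ${...} values above are OmegaConf-style interpolations (the _target_ keys in the step configs suggest Hydra-driven instantiation). A minimal sketch of how such references resolve, using hypothetical values rather than the real config:

from omegaconf import OmegaConf

# Hypothetical subset of the datasets config, for illustration only.
cfg = OmegaConf.create(
    {
        "datasets": {
            "outputs": "gs://bucket/outputs",
            "study_index": "${datasets.outputs}/study_index",
            "finngen_study_index": "${datasets.study_index}/finngen",
        }
    }
)

# Interpolations resolve on access, chaining through study_index:
assert cfg.datasets.finngen_study_index == "gs://bucket/outputs/study_index/finngen"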
config/step/finngen.yaml

Lines changed: 0 additions & 4 deletions
@@ -1,7 +1,3 @@
 _target_: otg.finngen.FinnGenStep
-finngen_phenotype_table_url: ${datasets.finngen_phenotype_table_url}
-finngen_release_prefix: ${datasets.finngen_release_prefix}
-finngen_sumstat_url_prefix: ${datasets.finngen_sumstat_url_prefix}
-finngen_sumstat_url_suffix: ${datasets.finngen_sumstat_url_suffix}
 finngen_study_index_out: ${datasets.finngen_study_index}
 finngen_summary_stats_out: ${datasets.finngen_summary_stats}

src/airflow/dags/common_airflow.py

Lines changed: 4 additions & 3 deletions
@@ -50,7 +50,7 @@
 # Shared DAG construction parameters.
 shared_dag_args = dict(
     owner="Open Targets Data Team",
-    retries=1,
+    retries=0,
 )
 shared_dag_kwargs = dict(
     tags=["genetics_etl", "experimental"],
@@ -68,6 +68,7 @@ def create_cluster(
     num_preemptible_workers: int = 0,
     num_local_ssds: int = 1,
     autoscaling_policy: str = GCP_AUTOSCALING_POLICY,
+    master_disk_size: int = 500,
 ) -> DataprocCreateClusterOperator:
     """Generate an Airflow task to create a Dataproc cluster. Common parameters are reused, and varying parameters can be specified as needed.

@@ -79,6 +80,7 @@
         num_preemptible_workers (int): Number of preemptible worker nodes. Defaults to 0.
         num_local_ssds (int): How many local SSDs to attach to each worker node, both primary and secondary. Defaults to 1.
         autoscaling_policy (str): Name of the autoscaling policy to use. Defaults to GCP_AUTOSCALING_POLICY.
+        master_disk_size (int): Size of the master node's boot disk in GB. Defaults to 500.

     Returns:
         DataprocCreateClusterOperator: Airflow task to create a Dataproc cluster.
@@ -89,7 +91,7 @@
         zone=GCP_ZONE,
         master_machine_type=master_machine_type,
         worker_machine_type=worker_machine_type,
-        master_disk_size=500,
+        master_disk_size=master_disk_size,
         worker_disk_size=500,
         num_preemptible_workers=num_preemptible_workers,
         num_workers=num_workers,
@@ -273,7 +275,6 @@ def delete_cluster(cluster_name: str) -> DataprocDeleteClusterOperator:
         cluster_name=cluster_name,
         region=GCP_REGION,
         trigger_rule=TriggerRule.ALL_DONE,
-        deferrable=True,
     )


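The new master_disk_size parameter replaces the previously hard-coded 500 GB master boot disk. The FinnGen preprocess DAG added in this commit overrides it; a standalone sketch of the call:

import common_airflow as common

# Larger master boot disk for a heavier workload; all other
# cluster parameters keep the defaults from create_cluster.
cluster_task = common.create_cluster(
    "otg-preprocess-finngen",
    autoscaling_policy="finngen-preprocess",
    master_disk_size=2000,
)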
src/airflow/dags/dag_genetics_etl.py

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@
         this_task = common.submit_step(
             cluster_name=CLUSTER_NAME,
             step_id=step_id,
+            task_id=step_id,
         )
         # Chain prerequisites.
         tasks[step_id] = this_task

src/airflow/dags/dag_preprocess.py

Lines changed: 2 additions & 2 deletions
@@ -8,7 +8,7 @@

 CLUSTER_NAME = "otg-preprocess"

-ALL_STEPS = ["finngen", "eqtl_catalogue", "ld_index", "variant_annotation"]
+ALL_STEPS = ["eqtl_catalogue", "ld_index", "variant_annotation"]


 with DAG(
@@ -18,7 +18,7 @@
     **common.shared_dag_kwargs,
 ):
     all_tasks = [
-        common.submit_step(cluster_name=CLUSTER_NAME, step_id=step)
+        common.submit_step(cluster_name=CLUSTER_NAME, step_id=step, task_id=step)
         for step in ALL_STEPS
     ]
     dag = common.generate_dag(cluster_name=CLUSTER_NAME, tasks=all_tasks)
src/airflow/dags/dag_preprocess_finngen.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+"""Airflow DAG for the Preprocess part of the pipeline."""
+from __future__ import annotations
+
+from pathlib import Path
+
+import common_airflow as common
+from airflow.models.dag import DAG
+from airflow.utils.trigger_rule import TriggerRule
+
+CLUSTER_NAME = "otg-preprocess-finngen"
+AUTOSCALING = "finngen-preprocess"
+
+RELEASEBUCKET = "gs://genetics_etl_python_playground/output/python_etl/parquet/XX.XX"
+SUMSTATS = f"{RELEASEBUCKET}/summary_statistics/finngen"
+WINDOWBASED_CLUMPED = (
+    f"{RELEASEBUCKET}/study_locus/from_sumstats_study_locus_window_clumped/finngen"
+)
+LD_CLUMPED = f"{RELEASEBUCKET}/study_locus/from_sumstats_study_locus_ld_clumped/finngen"
+PICSED = f"{RELEASEBUCKET}/credible_set/from_sumstats_study_locus/finngen"
+
+with DAG(
+    dag_id=Path(__file__).stem,
+    description="Open Targets Genetics — Finngen preprocess",
+    default_args=common.shared_dag_args,
+    **common.shared_dag_kwargs,
+):
+    study_and_sumstats = common.submit_step(
+        cluster_name=CLUSTER_NAME,
+        step_id="finngen",
+        task_id="finngen_sumstats_and_study_index",
+    )
+
+    window_based_clumping = common.submit_step(
+        cluster_name=CLUSTER_NAME,
+        step_id="clump",
+        task_id="finngen_window_based_clumping",
+        other_args=[
+            f"step.input_path={SUMSTATS}",
+            f"step.clumped_study_locus_path={WINDOWBASED_CLUMPED}",
+        ],
+    )
+    ld_clumping = common.submit_step(
+        cluster_name=CLUSTER_NAME,
+        step_id="clump",
+        task_id="finngen_ld_clumping",
+        other_args=[
+            f"step.input_path={WINDOWBASED_CLUMPED}",
+            f"step.clumped_study_locus_path={LD_CLUMPED}",
+        ],
+        trigger_rule=TriggerRule.ALL_DONE,
+    )
+
+    pics = common.submit_step(
+        cluster_name=CLUSTER_NAME,
+        step_id="pics",
+        task_id="finngen_pics",
+        other_args=[
+            f"step.study_locus_ld_annotated_in={LD_CLUMPED}",
+            f"step.picsed_study_locus_out={PICSED}",
+        ],
+        # Attempt this task even when the step above fails due to failifexists.
+        trigger_rule=TriggerRule.ALL_DONE,
+    )
+
+    (
+        common.create_cluster(
+            CLUSTER_NAME, autoscaling_policy=AUTOSCALING, master_disk_size=2000
+        )
+        >> common.install_dependencies(CLUSTER_NAME)
+        >> study_and_sumstats
+        >> window_based_clumping
+        >> ld_clumping
+        >> pics
+        >> common.delete_cluster(CLUSTER_NAME)
+    )

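The two TriggerRule.ALL_DONE tasks above run once their upstream tasks finish, whether those succeeded or failed. A generic sketch of the same mechanism with a placeholder operator (DAG and task ids hypothetical; EmptyOperator assumes Airflow 2.3+):

from airflow.models.dag import DAG
from airflow.operators.empty import EmptyOperator
from airflow.utils.trigger_rule import TriggerRule

with DAG(dag_id="trigger_rule_sketch"):
    # Runs even if an upstream task failed, as long as all upstream tasks finished.
    cleanup = EmptyOperator(task_id="cleanup", trigger_rule=TriggerRule.ALL_DONE)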
src/otg/dataset/study_locus.py

Lines changed: 1 addition & 1 deletion
@@ -13,14 +13,14 @@
     order_array_of_structs_by_field,
 )
 from otg.dataset.dataset import Dataset
-from otg.dataset.ld_index import LDIndex
 from otg.dataset.study_locus_overlap import StudyLocusOverlap
 from otg.method.clump import LDclumping

 if TYPE_CHECKING:
     from pyspark.sql import Column, DataFrame
     from pyspark.sql.types import StructType

+    from otg.dataset.ld_index import LDIndex
     from otg.dataset.study_index import StudyIndex


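Moving the LDIndex import under the existing TYPE_CHECKING block keeps the annotation available to type checkers without importing the module at runtime, a pattern commonly used to break import cycles. A generic sketch of the pattern (function name hypothetical):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by static type checkers, never at runtime.
    from otg.dataset.ld_index import LDIndex


def annotate_ld(ld_index: LDIndex) -> None:
    # With postponed evaluation of annotations (the __future__ import
    # above), this hint stays a string and needs no runtime import.
    ...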
src/otg/dataset/summary_statistics.py

Lines changed: 2 additions & 2 deletions
@@ -65,10 +65,10 @@ def window_based_clumping(
         """Generate study-locus from summary statistics by distance based clumping + collect locus.

         Args:
-            distance (int): Distance in base pairs to be used for clumping.
+            distance (int): Distance in base pairs to be used for clumping. Defaults to 500_000.
             gwas_significance (float, optional): GWAS significance threshold. Defaults to 5e-8.
             baseline_significance (float, optional): Baseline significance threshold for inclusion in the locus. Defaults to 0.05.
-            locus_collect_distance (int | None): The distance to collect locus around semi-indices.
+            locus_collect_distance (int | None): The distance to collect locus around semi-indices. If not provided, locus is not collected.

         Returns:
             StudyLocus: Clumped study-locus containing variants based on window.

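A hedged sketch of a call to this method, assuming summary_stats is an existing SummaryStatistics instance; the keyword values mirror the documented defaults:

# Window-based clumping without locus collection
# (locus_collect_distance is left as None).
study_locus = summary_stats.window_based_clumping(
    distance=500_000,
    gwas_significance=5e-8,
)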
src/otg/datasource/finngen/study_index.py

Lines changed: 21 additions & 19 deletions
@@ -1,15 +1,13 @@
 """Study Index for Finngen data source."""
 from __future__ import annotations

-from typing import TYPE_CHECKING
+from urllib.request import urlopen

 import pyspark.sql.functions as f
+from pyspark.sql import SparkSession

 from otg.dataset.study_index import StudyIndex

-if TYPE_CHECKING:
-    from pyspark.sql import DataFrame
-

 class FinnGenStudyIndex:
     """Study index dataset from FinnGen.
@@ -24,35 +22,39 @@ class FinnGenStudyIndex:
     Some fields are also populated as constants, such as study type and the initial sample size.
     """

+    finngen_phenotype_table_url: str = "https://r9.finngen.fi/api/phenos"
+    finngen_release_prefix: str = "FINNGEN_R9"
+    finngen_summary_stats_url_prefix: str = (
+        "gs://finngen-public-data-r9/summary_stats/finngen_R9_"
+    )
+    finngen_summary_stats_url_suffix: str = ".gz"
+
     @classmethod
     def from_source(
         cls: type[FinnGenStudyIndex],
-        finngen_studies: DataFrame,
-        finngen_release_prefix: str,
-        finngen_summary_stats_url_prefix: str,
-        finngen_summary_stats_url_suffix: str,
+        spark: SparkSession,
     ) -> StudyIndex:
         """This function ingests study level metadata from FinnGen.

         Args:
-            finngen_studies (DataFrame): FinnGen raw study table
-            finngen_release_prefix (str): Release prefix pattern.
-            finngen_summary_stats_url_prefix (str): URL prefix for summary statistics location.
-            finngen_summary_stats_url_suffix (str): URL prefix suffix for summary statistics location.
+            spark (SparkSession): Spark session object.

         Returns:
             StudyIndex: Parsed and annotated FinnGen study table.
         """
+        json_data = urlopen(cls.finngen_phenotype_table_url).read().decode("utf-8")
+        rdd = spark.sparkContext.parallelize([json_data])
+        raw_df = spark.read.json(rdd)
         return StudyIndex(
-            _df=finngen_studies.select(
-                f.concat(f.lit(f"{finngen_release_prefix}_"), f.col("phenocode")).alias(
-                    "studyId"
-                ),
+            _df=raw_df.select(
+                f.concat(
+                    f.lit(f"{cls.finngen_release_prefix}_"), f.col("phenocode")
+                ).alias("studyId"),
                 f.col("phenostring").alias("traitFromSource"),
                 f.col("num_cases").alias("nCases"),
                 f.col("num_controls").alias("nControls"),
                 (f.col("num_cases") + f.col("num_controls")).alias("nSamples"),
-                f.lit(finngen_release_prefix).alias("projectId"),
+                f.lit(cls.finngen_release_prefix).alias("projectId"),
                 f.lit("gwas").alias("studyType"),
                 f.lit(True).alias("hasSumstats"),
                 f.lit("377,277 (210,870 females and 166,407 males)").alias(
@@ -67,9 +69,9 @@ def from_source(
                 # Cohort label is consistent with GWAS Catalog curation.
                 f.array(f.lit("FinnGen")).alias("cohorts"),
                 f.concat(
-                    f.lit(finngen_summary_stats_url_prefix),
+                    f.lit(cls.finngen_summary_stats_url_prefix),
                     f.col("phenocode"),
-                    f.lit(finngen_summary_stats_url_suffix),
+                    f.lit(cls.finngen_summary_stats_url_suffix),
                 ).alias("summarystatsLocation"),
             ).withColumn(
                 "ldPopulationStructure",

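With the FinnGen constants now defined on the class, ingestion needs only a Spark session. A minimal usage sketch (session setup illustrative):

from pyspark.sql import SparkSession

from otg.datasource.finngen.study_index import FinnGenStudyIndex

spark = SparkSession.builder.appName("finngen_study_index").getOrCreate()

# Fetches https://r9.finngen.fi/api/phenos and parses it into a StudyIndex.
study_index = FinnGenStudyIndex.from_source(spark)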
src/otg/datasource/finngen/summary_stats.py

Lines changed: 28 additions & 7 deletions
@@ -3,38 +3,59 @@
 from __future__ import annotations

 from dataclasses import dataclass
-from typing import TYPE_CHECKING

 import pyspark.sql.functions as f
 import pyspark.sql.types as t
+from pyspark.sql import SparkSession
+from pyspark.sql.types import StringType, StructField, StructType

 from otg.common.utils import parse_pvalue
 from otg.dataset.summary_statistics import SummaryStatistics

-if TYPE_CHECKING:
-    from pyspark.sql import DataFrame
-

 @dataclass
 class FinnGenSummaryStats:
     """Summary statistics dataset for FinnGen."""

+    raw_schema: t.StructType = StructType(
+        [
+            StructField("#chrom", StringType(), True),
+            StructField("pos", StringType(), True),
+            StructField("ref", StringType(), True),
+            StructField("alt", StringType(), True),
+            StructField("rsids", StringType(), True),
+            StructField("nearest_genes", StringType(), True),
+            StructField("pval", StringType(), True),
+            StructField("mlogp", StringType(), True),
+            StructField("beta", StringType(), True),
+            StructField("sebeta", StringType(), True),
+            StructField("af_alt", StringType(), True),
+            StructField("af_alt_cases", StringType(), True),
+            StructField("af_alt_controls", StringType(), True),
+        ]
+    )
+
     @classmethod
     def from_source(
         cls: type[FinnGenSummaryStats],
-        summary_stats_df: DataFrame,
+        spark: SparkSession,
+        raw_files: list[str],
     ) -> SummaryStatistics:
         """Ingests all summary stats for all FinnGen studies.

         Args:
-            summary_stats_df (DataFrame): Raw summary statistics dataframe
+            spark (SparkSession): Spark session object.
+            raw_files (list[str]): Paths to raw summary statistics .gz files.

         Returns:
             SummaryStatistics: Processed summary statistics dataset
         """
         processed_summary_stats_df = (
-            summary_stats_df
+            spark.read.schema(cls.raw_schema)
+            .option("delimiter", "\t")
+            .csv(raw_files, header=True)
             # Drop rows which don't have proper position.
+            .filter(f.col("pos").cast(t.IntegerType()).isNotNull())
             .select(
                 # From the full path, extracts just the filename, and converts to upper case to get the study ID.
                 f.upper(f.regexp_extract(f.input_file_name(), r"([^/]+)\.gz", 1)).alias(

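Likewise, summary statistics ingestion now reads the raw .gz files directly from a list of paths. A usage sketch (the file path is hypothetical):

from pyspark.sql import SparkSession

from otg.datasource.finngen.summary_stats import FinnGenSummaryStats

spark = SparkSession.builder.appName("finngen_sumstats").getOrCreate()

# Hypothetical raw FinnGen file; the study ID is derived from the filename.
raw_files = ["gs://finngen-public-data-r9/summary_stats/finngen_R9_EXAMPLE.gz"]

summary_stats = FinnGenSummaryStats.from_source(spark=spark, raw_files=raw_files)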