From cf2be5b7568926de8bcbdc2737a22d650e633a7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Fri, 10 Nov 2023 14:39:12 +0000 Subject: [PATCH 01/25] chore: changes in config --- config/datasets/gcp.yaml | 17 +++++++++++------ config/step/locus_to_gene.yaml | 2 +- src/airflow/dags/configs/dag.yaml | 3 --- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/config/datasets/gcp.yaml b/config/datasets/gcp.yaml index fa5b10fad..ad80e4e65 100644 --- a/config/datasets/gcp.yaml +++ b/config/datasets/gcp.yaml @@ -27,19 +27,24 @@ gene_index: ${datasets.outputs}/gene_index variant_annotation: ${datasets.outputs}/variant_annotation variant_index: ${datasets.outputs}/variant_index study_locus: ${datasets.outputs}/study_locus +credible_set: ${datasets.outputs}/credible_set +study_index: ${datasets.outputs}/study_index +summary_statistics: ${datasets.outputs}/summary_statistics study_locus_overlap: ${datasets.outputs}/study_locus_overlap colocalisation: ${datasets.outputs}/colocalisation v2g: ${datasets.outputs}/v2g ld_index: ${datasets.outputs}/ld_index -catalog_study_index: ${datasets.outputs}/catalog_study_index -catalog_study_locus: ${datasets.study_locus}/catalog_study_locus -finngen_study_index: ${datasets.outputs}/finngen_study_index -finngen_summary_stats: ${datasets.outputs}/finngen_summary_stats -ukbiobank_study_index: ${datasets.outputs}/ukbiobank_study_index +catalog_study_index: ${datasets.study_index}/catalog_curated +catalog_study_locus: ${datasets.credible_set}/catalog_curated +finngen_study_index: ${datasets.study_index}/finngen +finngen_summary_stats: ${datasets.summary_statistics}/finngen +from_sumstats_study_locus: ${datasets.study_locus}/from_sumstats +from_sumstats_pics: ${datasets.credible_set}/from_sumstats +ukbiobank_study_index: ${datasets.study_index}/ukbiobank l2g_model: ${datasets.outputs}/l2g_model l2g_predictions: ${datasets.outputs}/l2g_predictions # Constants finngen_release_prefix: FINNGEN_R9 -finngen_sumstat_url_prefix: https://storage.googleapis.com/finngen-public-data-r9/summary_stats/finngen_R9_ +finngen_sumstat_url_prefix: gs://finngen-public-data-r9/summary_stats/finngen_R9_ finngen_sumstat_url_suffix: .gz diff --git a/config/step/locus_to_gene.yaml b/config/step/locus_to_gene.yaml index 47f014c55..9f7004e17 100644 --- a/config/step/locus_to_gene.yaml +++ b/config/step/locus_to_gene.yaml @@ -9,7 +9,7 @@ wandb_run_name: null perform_cross_validation: false model_path: ${datasets.l2g_model} predictions_path: ${datasets.l2g_predictions} -study_locus_path: ${datasets.study_locus} +study_locus_path: ${datasets.credible_set} variant_gene_path: ${datasets.v2g} colocalisation_path: ${datasets.colocalisation} study_index_path: ${datasets.catalog_study_index} diff --git a/src/airflow/dags/configs/dag.yaml b/src/airflow/dags/configs/dag.yaml index 30a7c3827..d7ffabc03 100644 --- a/src/airflow/dags/configs/dag.yaml +++ b/src/airflow/dags/configs/dag.yaml @@ -7,17 +7,14 @@ prerequisites: - "variant_index" - "gene_index" -- id: "finngen" - id: "ukbiobank" - id: "study_locus_overlap" prerequisites: - "gwas_catalog" - - "finngen" - "ukbiobank" - id: "locus_to_gene" prerequisites: - "gwas_catalog" - - "finngen" - "ukbiobank" - "variant_index" - "v2g" From 44766d22febfb2ae2d2ed4224106c90128ffc5b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Thu, 16 Nov 2023 17:33:41 +0000 Subject: [PATCH 02/25] fix: change definition of negative l2g evidence --- .../open_targets/l2g_gold_standard.py | 147 +++++++++--------- 1 file changed, 76 insertions(+), 71 deletions(-) diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py index 46ec21502..4bf89732b 100644 --- a/src/otg/datasource/open_targets/l2g_gold_standard.py +++ b/src/otg/datasource/open_targets/l2g_gold_standard.py @@ -62,78 +62,83 @@ def as_l2g_gold_standard( "leftStudyLocusId", "rightStudyLocusId" ) interactions_df = cls.process_gene_interactions(interactions) - return L2GGoldStandard( - _df=( - gold_standard_curation.filter( - f.col("gold_standard_info.highest_confidence").isin( - ["High", "Medium"] - ) - ) - .select( - f.col("association_info.otg_id").alias("studyId"), - f.col("gold_standard_info.gene_id").alias("geneId"), - f.concat_ws( - "_", - f.col("sentinel_variant.locus_GRCh38.chromosome"), - f.col("sentinel_variant.locus_GRCh38.position"), - f.col("sentinel_variant.alleles.reference"), - f.col("sentinel_variant.alleles.alternative"), - ).alias("variantId"), - f.col("metadata.set_label").alias("source"), - ) - .withColumn( - "studyLocusId", - StudyLocus.assign_study_locus_id( - f.col("studyId"), f.col("variantId") - ), - ) - .groupBy("studyLocusId", "studyId", "variantId", "geneId") - .agg( - f.collect_set("source").alias("sources"), - ) - # Assign Positive or Negative Status based on confidence - .join( - v2g.df.filter(f.col("distance").isNotNull()).select( - "variantId", "geneId", "distance" - ), - on=["variantId", "geneId"], - how="inner", - ) - .withColumn( - "goldStandardSet", - f.when(f.col("distance") <= 500_000, f.lit("positive")).otherwise( - f.lit("negative") - ), - ) - # Remove redundant loci by testing they are truly independent - .alias("left") - .join( - overlaps_df.alias("right"), - (f.col("left.variantId") == f.col("right.leftStudyLocusId")) - | (f.col("left.variantId") == f.col("right.rightStudyLocusId")), - how="left", - ) - .distinct() - # Remove redundant genes by testing they do not interact with a positive gene - .join( - interactions_df.alias("interactions"), - (f.col("left.geneId") == f.col("interactions.geneIdA")) - | (f.col("left.geneId") == f.col("interactions.geneIdB")), - how="left", - ) - .withColumn("interacting", (f.col("score") > 0.7)) - # filter out genes where geneIdA has goldStandardSet negative but geneIdA and gene IdB are interacting - .filter( - ~( - (f.col("goldStandardSet") == 0) - & (f.col("interacting")) - & ( - (f.col("left.geneId") == f.col("interactions.geneIdA")) - | (f.col("left.geneId") == f.col("interactions.geneIdB")) - ) + + positive_set = ( + gold_standard_curation.filter( + f.col("gold_standard_info.highest_confidence").isin(["High", "Medium"]) + ) + .select( + f.col("association_info.otg_id").alias("studyId"), + f.col("gold_standard_info.gene_id").alias("geneId"), + f.concat_ws( + "_", + f.col("sentinel_variant.locus_GRCh38.chromosome"), + f.col("sentinel_variant.locus_GRCh38.position"), + f.col("sentinel_variant.alleles.reference"), + f.col("sentinel_variant.alleles.alternative"), + ).alias("variantId"), + f.col("metadata.set_label").alias("source"), + ) + .withColumn( + "studyLocusId", + StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")), + ) + .groupBy("studyLocusId", "studyId", "variantId", "geneId") + .agg( + f.collect_set("source").alias("sources"), + ) + ) + + full_set = ( + # Bring negative evidence based on genes that are in the vicinity of the locus but are not part of the positive set + positive_set.alias("positives") + .join( + v2g.df.filter(f.col("distance") <= 500_000) + .select("variantId", "geneId", "distance") + .alias("negatives"), + on="variantId", + how="left", + ) + # Assign set label + .withColumn( + "goldStandardSet", + f.when( + (f.col("positives.geneId") == f.col("negatives.geneId")) + # to keep the positives that are outside the v2g dataset + | (f.col("negatives.geneId").isNull()), + f.lit("positive"), + ).otherwise("negative"), + ) + # Remove redundant loci by testing they are truly independent + .alias("left") + .join( + overlaps_df.alias("right"), + (f.col("left.variantId") == f.col("right.leftStudyLocusId")) + | (f.col("left.variantId") == f.col("right.rightStudyLocusId")), + how="left", + ) + .distinct() + # filter out genes where geneIdA has goldStandardSet negative but geneIdA and gene IdB are interacting + .join( + interactions_df.alias("interactions"), + (f.col("left.geneId") == f.col("interactions.geneIdA")) + | (f.col("left.geneId") == f.col("interactions.geneIdB")), + how="left", + ) + .withColumn("interacting", (f.col("score") > 0.7)) + .filter( + ~( + (f.col("goldStandardSet") == 0) + & (f.col("interacting")) + & ( + (f.col("left.geneId") == f.col("interactions.geneIdA")) + | (f.col("left.geneId") == f.col("interactions.geneIdB")) ) ) - .select("studyLocusId", "geneId", "goldStandardSet", "sources") - ), + ) + .select("studyLocusId", "geneId", "goldStandardSet", "sources") + ) + return L2GGoldStandard( + _df=full_set, _schema=L2GGoldStandard.get_schema(), ) From e17df5b84d705faec1c021a8691cf80ede7c4fe4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Fri, 17 Nov 2023 09:25:31 +0000 Subject: [PATCH 03/25] refactor: modularise logic for gold standards --- .../open_targets/l2g_gold_standard.py | 165 ++++++++++++------ 1 file changed, 107 insertions(+), 58 deletions(-) diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py index 4bf89732b..f5e2a0b25 100644 --- a/src/otg/datasource/open_targets/l2g_gold_standard.py +++ b/src/otg/datasource/open_targets/l2g_gold_standard.py @@ -15,8 +15,8 @@ class OpenTargetsL2GGoldStandard: """Parser for OTGenetics locus to gene gold standards curation. The curation is processed to generate a dataset with 2 labels: - - Gold Standard Positive (GSP): Variant is within 500kb of gene - - Gold Standard Negative (GSN): Variant is not within 500kb of gene + - Gold Standard Positive (GSP): When the lead variant is part of a curated list of GWAS loci with known gene-trait associations. + - Gold Standard Negative (GSN): When the lead variant is not part of a curated list of GWAS loci with known gene-trait associations but is in the vicinity of a gene's TSS. """ @staticmethod @@ -39,31 +39,17 @@ def process_gene_interactions(interactions: DataFrame) -> DataFrame: "scoring as score", ) - @classmethod - def as_l2g_gold_standard( - cls: type[OpenTargetsL2GGoldStandard], - gold_standard_curation: DataFrame, - v2g: V2G, - study_locus_overlap: StudyLocusOverlap, - interactions: DataFrame, - ) -> L2GGoldStandard: - """Initialise L2GGoldStandard from source dataset. + @staticmethod + def create_positive_set(gold_standard_curation: DataFrame) -> DataFrame: + """Parse positive set from gold standard curation. Args: - gold_standard_curation (DataFrame): Gold standard curation dataframe, extracted from https://github.com/opentargets/genetics-gold-standards - v2g (V2G): Variant to gene dataset to bring distance between a variant and a gene's TSS - study_locus_overlap (StudyLocusOverlap): Study locus overlap dataset to remove duplicated loci - interactions (DataFrame): Gene-gene interactions dataset to remove negative cases where the gene interacts with a positive gene + gold_standard_curation (DataFrame): Gold standard curation dataframe Returns: - L2GGoldStandard: L2G Gold Standard dataset + DataFrame: Positive set """ - overlaps_df = study_locus_overlap._df.select( - "leftStudyLocusId", "rightStudyLocusId" - ) - interactions_df = cls.process_gene_interactions(interactions) - - positive_set = ( + return ( gold_standard_curation.filter( f.col("gold_standard_info.highest_confidence").isin(["High", "Medium"]) ) @@ -84,48 +70,56 @@ def as_l2g_gold_standard( StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")), ) .groupBy("studyLocusId", "studyId", "variantId", "geneId") - .agg( - f.collect_set("source").alias("sources"), - ) + .agg(f.collect_set("source").alias("sources")) ) - full_set = ( - # Bring negative evidence based on genes that are in the vicinity of the locus but are not part of the positive set - positive_set.alias("positives") - .join( - v2g.df.filter(f.col("distance") <= 500_000) - .select("variantId", "geneId", "distance") - .alias("negatives"), - on="variantId", - how="left", - ) - # Assign set label - .withColumn( - "goldStandardSet", - f.when( - (f.col("positives.geneId") == f.col("negatives.geneId")) - # to keep the positives that are outside the v2g dataset - | (f.col("negatives.geneId").isNull()), - f.lit("positive"), - ).otherwise("negative"), - ) - # Remove redundant loci by testing they are truly independent - .alias("left") - .join( - overlaps_df.alias("right"), - (f.col("left.variantId") == f.col("right.leftStudyLocusId")) - | (f.col("left.variantId") == f.col("right.rightStudyLocusId")), - how="left", - ) - .distinct() - # filter out genes where geneIdA has goldStandardSet negative but geneIdA and gene IdB are interacting + @staticmethod + def create_full_set(positive_set: DataFrame, v2g: V2G) -> DataFrame: + """Create full set of positive and negative evidence of locus to gene associations. + + Args: + positive_set (DataFrame): Positive set + v2g (V2G): Variant to gene dataset to bring distance between a variant and a gene's TSS + + Returns: + DataFrame: Full set of positive and negative evidence of locus to gene associations + """ + return positive_set.join( + v2g.df.filter(f.col("distance") <= 500_000), + on="variantId", + how="left", + ).withColumn( + "goldStandardSet", + f.when( + (f.col("positives.geneId") == f.col("negatives.geneId")) + # to keep the positives that are outside the v2g dataset + | (f.col("negatives.geneId").isNull()), + f.lit("positive"), + ).otherwise("negative"), + ) + + @staticmethod + def remove_false_negatives( + full_set: DataFrame, interactions_df: DataFrame + ) -> DataFrame: + """Remove redundant loci by testing they are truly independent. + + Args: + full_set (DataFrame): Full set of positive and negative evidence of locus to gene associations. These include false negatives. + interactions_df (DataFrame): Top scoring gene-gene interaction per pair of genes + + Returns: + DataFrame: Full set of positive and negative evidence of locus to gene associations. False negatives are removed. + """ + return ( + full_set.alias("left") .join( interactions_df.alias("interactions"), (f.col("left.geneId") == f.col("interactions.geneIdA")) | (f.col("left.geneId") == f.col("interactions.geneIdB")), how="left", ) - .withColumn("interacting", (f.col("score") > 0.7)) + .withColumn("interacting", (f.col("score") > 0.7)) # remove hardcoded value .filter( ~( (f.col("goldStandardSet") == 0) @@ -136,9 +130,64 @@ def as_l2g_gold_standard( ) ) ) - .select("studyLocusId", "geneId", "goldStandardSet", "sources") ) + + @staticmethod + def remove_redundant_locus( + full_set: DataFrame, study_locus_overlap: StudyLocusOverlap + ) -> DataFrame: + """Remove redundant loci by testing they are truly independent. + + Args: + full_set (DataFrame): Full set of positive and negative evidence of locus to gene associations. These include false negatives. + study_locus_overlap (StudyLocusOverlap): Study locus overlap dataset to remove duplicated loci + + Returns: + DataFrame: Full set of positive and negative evidence of locus to gene associations. False negatives are removed. # TODO rename + """ + return ( + full_set.alias("left") + .join( + study_locus_overlap.df.select( + "leftStudyLocusId", "rightStudyLocusId" + ).alias("right"), + (f.col("left.variantId") == f.col("right.leftStudyLocusId")) + | (f.col("left.variantId") == f.col("right.rightStudyLocusId")), + how="left", + ) + .distinct() + ) + + @classmethod + def as_l2g_gold_standard( + cls: type[OpenTargetsL2GGoldStandard], + gold_standard_curation: DataFrame, + v2g: V2G, + study_locus_overlap: StudyLocusOverlap, + interactions: DataFrame, + ) -> L2GGoldStandard: + """Initialise L2GGoldStandard from source dataset. + + Args: + gold_standard_curation (DataFrame): Gold standard curation dataframe, extracted from https://github.com/opentargets/genetics-gold-standards + v2g (V2G): Variant to gene dataset to bring distance between a variant and a gene's TSS + study_locus_overlap (StudyLocusOverlap): Study locus overlap dataset to remove duplicated loci + interactions (DataFrame): Gene-gene interactions dataset to remove negative cases where the gene interacts with a positive gene + + Returns: + L2GGoldStandard: L2G Gold Standard dataset + """ + interactions_df = cls.process_gene_interactions(interactions) + + positive_set = cls.create_positive_set(gold_standard_curation) + + full_set = cls.create_full_set(positive_set, v2g) + + final_set = full_set.transform( + cls.remove_redundant_locus, study_locus_overlap + ).transform(cls.remove_false_negatives, interactions_df) + return L2GGoldStandard( - _df=full_set, + _df=final_set, _schema=L2GGoldStandard.get_schema(), ) From f7eba79b6d466a962997f6e8306c469c7483cea8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Fri, 17 Nov 2023 09:43:31 +0000 Subject: [PATCH 04/25] refactor: move hardcoded values to constants --- .../open_targets/l2g_gold_standard.py | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py index f5e2a0b25..9986339b5 100644 --- a/src/otg/datasource/open_targets/l2g_gold_standard.py +++ b/src/otg/datasource/open_targets/l2g_gold_standard.py @@ -19,6 +19,11 @@ class OpenTargetsL2GGoldStandard: - Gold Standard Negative (GSN): When the lead variant is not part of a curated list of GWAS loci with known gene-trait associations but is in the vicinity of a gene's TSS. """ + LOCUS_TO_GENE_WINDOW = 500_000 + GS_POSITIVE_LABEL = "positive" + GS_NEGATIVE_LABEL = "negative" + INTERACTION_THRESHOLD = 0.7 + @staticmethod def process_gene_interactions(interactions: DataFrame) -> DataFrame: """Extract top scoring gene-gene interaction from the interactions dataset of the Platform. @@ -85,7 +90,9 @@ def create_full_set(positive_set: DataFrame, v2g: V2G) -> DataFrame: DataFrame: Full set of positive and negative evidence of locus to gene associations """ return positive_set.join( - v2g.df.filter(f.col("distance") <= 500_000), + v2g.df.filter( + f.col("distance") <= OpenTargetsL2GGoldStandard.LOCUS_TO_GENE_WINDOW + ), on="variantId", how="left", ).withColumn( @@ -94,8 +101,8 @@ def create_full_set(positive_set: DataFrame, v2g: V2G) -> DataFrame: (f.col("positives.geneId") == f.col("negatives.geneId")) # to keep the positives that are outside the v2g dataset | (f.col("negatives.geneId").isNull()), - f.lit("positive"), - ).otherwise("negative"), + f.lit(OpenTargetsL2GGoldStandard.GS_POSITIVE_LABEL), + ).otherwise(OpenTargetsL2GGoldStandard.GS_NEGATIVE_LABEL), ) @staticmethod @@ -119,10 +126,15 @@ def remove_false_negatives( | (f.col("left.geneId") == f.col("interactions.geneIdB")), how="left", ) - .withColumn("interacting", (f.col("score") > 0.7)) # remove hardcoded value + .withColumn( + "interacting", + (f.col("score") > OpenTargetsL2GGoldStandard.INTERACTION_THRESHOLD), + ) .filter( ~( - (f.col("goldStandardSet") == 0) + ( + f.col("goldStandardSet") == 0 + ) # bugfix: goldStandardSet is a string, not an int & (f.col("interacting")) & ( (f.col("left.geneId") == f.col("interactions.geneIdA")) @@ -184,7 +196,9 @@ def as_l2g_gold_standard( full_set = cls.create_full_set(positive_set, v2g) final_set = full_set.transform( - cls.remove_redundant_locus, study_locus_overlap + # TODO: move logic to L2GGoldStandard + cls.remove_redundant_locus, + study_locus_overlap, ).transform(cls.remove_false_negatives, interactions_df) return L2GGoldStandard( From 65be4708f8b1dbfd24b6e5c6b34cd4098932b9f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Fri, 17 Nov 2023 10:02:56 +0000 Subject: [PATCH 05/25] refactor: turn `OpenTargetsL2GGoldStandard` into class methods --- .../open_targets/l2g_gold_standard.py | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py index 9986339b5..2e3a7fe8d 100644 --- a/src/otg/datasource/open_targets/l2g_gold_standard.py +++ b/src/otg/datasource/open_targets/l2g_gold_standard.py @@ -1,6 +1,8 @@ """Parser for OTPlatform locus to gene gold standards curation.""" from __future__ import annotations +from typing import Type + import pyspark.sql.functions as f from pyspark.sql import DataFrame @@ -24,8 +26,10 @@ class OpenTargetsL2GGoldStandard: GS_NEGATIVE_LABEL = "negative" INTERACTION_THRESHOLD = 0.7 - @staticmethod - def process_gene_interactions(interactions: DataFrame) -> DataFrame: + @classmethod + def process_gene_interactions( + cls: Type[OpenTargetsL2GGoldStandard], interactions: DataFrame + ) -> DataFrame: """Extract top scoring gene-gene interaction from the interactions dataset of the Platform. Args: @@ -44,8 +48,10 @@ def process_gene_interactions(interactions: DataFrame) -> DataFrame: "scoring as score", ) - @staticmethod - def create_positive_set(gold_standard_curation: DataFrame) -> DataFrame: + @classmethod + def create_positive_set( + cls: Type[OpenTargetsL2GGoldStandard], gold_standard_curation: DataFrame + ) -> DataFrame: """Parse positive set from gold standard curation. Args: @@ -78,8 +84,10 @@ def create_positive_set(gold_standard_curation: DataFrame) -> DataFrame: .agg(f.collect_set("source").alias("sources")) ) - @staticmethod - def create_full_set(positive_set: DataFrame, v2g: V2G) -> DataFrame: + @classmethod + def create_full_set( + cls: Type[OpenTargetsL2GGoldStandard], positive_set: DataFrame, v2g: V2G + ) -> DataFrame: """Create full set of positive and negative evidence of locus to gene associations. Args: @@ -90,9 +98,7 @@ def create_full_set(positive_set: DataFrame, v2g: V2G) -> DataFrame: DataFrame: Full set of positive and negative evidence of locus to gene associations """ return positive_set.join( - v2g.df.filter( - f.col("distance") <= OpenTargetsL2GGoldStandard.LOCUS_TO_GENE_WINDOW - ), + v2g.df.filter(f.col("distance") <= cls.LOCUS_TO_GENE_WINDOW), on="variantId", how="left", ).withColumn( @@ -101,13 +107,15 @@ def create_full_set(positive_set: DataFrame, v2g: V2G) -> DataFrame: (f.col("positives.geneId") == f.col("negatives.geneId")) # to keep the positives that are outside the v2g dataset | (f.col("negatives.geneId").isNull()), - f.lit(OpenTargetsL2GGoldStandard.GS_POSITIVE_LABEL), - ).otherwise(OpenTargetsL2GGoldStandard.GS_NEGATIVE_LABEL), + f.lit(cls.GS_POSITIVE_LABEL), + ).otherwise(cls.GS_NEGATIVE_LABEL), ) - @staticmethod + @classmethod def remove_false_negatives( - full_set: DataFrame, interactions_df: DataFrame + cls: Type[OpenTargetsL2GGoldStandard], + full_set: DataFrame, + interactions_df: DataFrame, ) -> DataFrame: """Remove redundant loci by testing they are truly independent. @@ -128,7 +136,7 @@ def remove_false_negatives( ) .withColumn( "interacting", - (f.col("score") > OpenTargetsL2GGoldStandard.INTERACTION_THRESHOLD), + (f.col("score") > cls.INTERACTION_THRESHOLD), ) .filter( ~( @@ -144,9 +152,11 @@ def remove_false_negatives( ) ) - @staticmethod + @classmethod def remove_redundant_locus( - full_set: DataFrame, study_locus_overlap: StudyLocusOverlap + cls: Type[OpenTargetsL2GGoldStandard], + full_set: DataFrame, + study_locus_overlap: StudyLocusOverlap, ) -> DataFrame: """Remove redundant loci by testing they are truly independent. From 1518156f4f44aa10d544e45ca9bb321ffc485d97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Fri, 17 Nov 2023 10:58:32 +0000 Subject: [PATCH 06/25] refactor(gold_standard): move logic to refine gold standards to `L2GGoldStandard` --- src/otg/dataset/l2g_gold_standard.py | 104 ++++++++++++++- .../open_targets/l2g_gold_standard.py | 125 ++---------------- 2 files changed, 111 insertions(+), 118 deletions(-) diff --git a/src/otg/dataset/l2g_gold_standard.py b/src/otg/dataset/l2g_gold_standard.py index 44470581f..b5f1189a9 100644 --- a/src/otg/dataset/l2g_gold_standard.py +++ b/src/otg/dataset/l2g_gold_standard.py @@ -2,9 +2,12 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Type + +import pyspark.sql.functions as f from otg.common.schemas import parse_spark_schema +from otg.common.spark_helpers import get_record_with_maximum_value from otg.dataset.dataset import Dataset if TYPE_CHECKING: @@ -19,6 +22,8 @@ class L2GGoldStandard(Dataset): """L2G gold standard dataset.""" + INTERACTION_THRESHOLD = 0.7 + @classmethod def from_otg_curation( cls: type[L2GGoldStandard], @@ -42,8 +47,34 @@ def from_otg_curation( OpenTargetsL2GGoldStandard, ) - return OpenTargetsL2GGoldStandard.as_l2g_gold_standard( - gold_standard_curation, v2g, study_locus_overlap, interactions + interactions_df = cls.process_gene_interactions(interactions) + + return ( + OpenTargetsL2GGoldStandard.as_l2g_gold_standard(gold_standard_curation, v2g) + .filter_unique_associations(study_locus_overlap) + .remove_false_negatives(interactions_df) + ) + + @classmethod + def process_gene_interactions( + cls: Type[L2GGoldStandard], interactions: DataFrame + ) -> DataFrame: + """Extract top scoring gene-gene interaction from the interactions dataset of the Platform. + + Args: + interactions (DataFrame): Gene-gene interactions dataset + + Returns: + DataFrame: Top scoring gene-gene interaction per pair of genes + """ + return get_record_with_maximum_value( + interactions, + ["targetA", "targetB"], + "scoring", + ).selectExpr( + "targetA as geneIdA", + "targetB as geneIdB", + "scoring as score", ) @classmethod @@ -54,3 +85,70 @@ def get_schema(cls: type[L2GGoldStandard]) -> StructType: StructType: Spark schema for the L2GGoldStandard dataset """ return parse_spark_schema("l2g_gold_standard.json") + + def filter_unique_associations( + self: L2GGoldStandard, + study_locus_overlap: StudyLocusOverlap, + ) -> L2GGoldStandard: + """Refines the gold standard to filter out loci that are not independent. redundant loci by testing they are truly independent. + + Args: + study_locus_overlap (StudyLocusOverlap): A dataset detailing variants that overlap between StudyLocus. + + Returns: + L2GGoldStandard: L2GGoldStandard updated to exclude false negatives and redundant positives. + """ + # TODO: Test this logic + self.df = ( + self.df.alias("left") + .join( + study_locus_overlap.df.select( + "leftStudyLocusId", "rightStudyLocusId" + ).alias("right"), + (f.col("left.variantId") == f.col("right.leftStudyLocusId")) + | (f.col("left.variantId") == f.col("right.rightStudyLocusId")), + how="left", + ) + .distinct() + ) + return self + + def remove_false_negatives( + self: L2GGoldStandard, + interactions_df: DataFrame, + ) -> L2GGoldStandard: + """Refines the gold standard to remove negative gold standard instances where the gene interacts with a positive gene. + + Args: + interactions_df (DataFrame): Top scoring gene-gene interaction per pair of genes + + Returns: + L2GGoldStandard: A refined set of locus-to-gene associations with increased reliability, having excluded loci that were likely false negatives due to gene-gene interaction confounding. + """ + # TODO: Test this logic + self.df = ( + self.df.alias("left") + .join( + interactions_df.alias("interactions"), + (f.col("left.geneId") == f.col("interactions.geneIdA")) + | (f.col("left.geneId") == f.col("interactions.geneIdB")), + how="left", + ) + .withColumn( + "interacting", + (f.col("score") > self.INTERACTION_THRESHOLD), + ) + .filter( + ~( + ( + f.col("goldStandardSet") == 0 + ) # TODO: goldStandardSet is a string, not an int + & (f.col("interacting")) + & ( + (f.col("left.geneId") == f.col("interactions.geneIdA")) + | (f.col("left.geneId") == f.col("interactions.geneIdB")) + ) + ) + ) + ) + return self diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py index 2e3a7fe8d..5b47468f7 100644 --- a/src/otg/datasource/open_targets/l2g_gold_standard.py +++ b/src/otg/datasource/open_targets/l2g_gold_standard.py @@ -6,10 +6,8 @@ import pyspark.sql.functions as f from pyspark.sql import DataFrame -from otg.common.spark_helpers import get_record_with_maximum_value from otg.dataset.l2g_gold_standard import L2GGoldStandard from otg.dataset.study_locus import StudyLocus -from otg.dataset.study_locus_overlap import StudyLocusOverlap from otg.dataset.v2g import V2G @@ -24,32 +22,9 @@ class OpenTargetsL2GGoldStandard: LOCUS_TO_GENE_WINDOW = 500_000 GS_POSITIVE_LABEL = "positive" GS_NEGATIVE_LABEL = "negative" - INTERACTION_THRESHOLD = 0.7 @classmethod - def process_gene_interactions( - cls: Type[OpenTargetsL2GGoldStandard], interactions: DataFrame - ) -> DataFrame: - """Extract top scoring gene-gene interaction from the interactions dataset of the Platform. - - Args: - interactions (DataFrame): Gene-gene interactions dataset - - Returns: - DataFrame: Top scoring gene-gene interaction per pair of genes - """ - return get_record_with_maximum_value( - interactions, - ["targetA", "targetB"], - "scoring", - ).selectExpr( - "targetA as geneIdA", - "targetB as geneIdB", - "scoring as score", - ) - - @classmethod - def create_positive_set( + def parse_positive_curation( cls: Type[OpenTargetsL2GGoldStandard], gold_standard_curation: DataFrame ) -> DataFrame: """Parse positive set from gold standard curation. @@ -85,18 +60,21 @@ def create_positive_set( ) @classmethod - def create_full_set( + def expand_gold_standard_with_negatives( cls: Type[OpenTargetsL2GGoldStandard], positive_set: DataFrame, v2g: V2G ) -> DataFrame: """Create full set of positive and negative evidence of locus to gene associations. + Negative evidence consists of all genes within a window of 500kb of the lead variant that are not in the positive set. + Args: - positive_set (DataFrame): Positive set + positive_set (DataFrame): Positive set from curation v2g (V2G): Variant to gene dataset to bring distance between a variant and a gene's TSS Returns: DataFrame: Full set of positive and negative evidence of locus to gene associations """ + # TODO: test function return positive_set.join( v2g.df.filter(f.col("distance") <= cls.LOCUS_TO_GENE_WINDOW), on="variantId", @@ -111,107 +89,24 @@ def create_full_set( ).otherwise(cls.GS_NEGATIVE_LABEL), ) - @classmethod - def remove_false_negatives( - cls: Type[OpenTargetsL2GGoldStandard], - full_set: DataFrame, - interactions_df: DataFrame, - ) -> DataFrame: - """Remove redundant loci by testing they are truly independent. - - Args: - full_set (DataFrame): Full set of positive and negative evidence of locus to gene associations. These include false negatives. - interactions_df (DataFrame): Top scoring gene-gene interaction per pair of genes - - Returns: - DataFrame: Full set of positive and negative evidence of locus to gene associations. False negatives are removed. - """ - return ( - full_set.alias("left") - .join( - interactions_df.alias("interactions"), - (f.col("left.geneId") == f.col("interactions.geneIdA")) - | (f.col("left.geneId") == f.col("interactions.geneIdB")), - how="left", - ) - .withColumn( - "interacting", - (f.col("score") > cls.INTERACTION_THRESHOLD), - ) - .filter( - ~( - ( - f.col("goldStandardSet") == 0 - ) # bugfix: goldStandardSet is a string, not an int - & (f.col("interacting")) - & ( - (f.col("left.geneId") == f.col("interactions.geneIdA")) - | (f.col("left.geneId") == f.col("interactions.geneIdB")) - ) - ) - ) - ) - - @classmethod - def remove_redundant_locus( - cls: Type[OpenTargetsL2GGoldStandard], - full_set: DataFrame, - study_locus_overlap: StudyLocusOverlap, - ) -> DataFrame: - """Remove redundant loci by testing they are truly independent. - - Args: - full_set (DataFrame): Full set of positive and negative evidence of locus to gene associations. These include false negatives. - study_locus_overlap (StudyLocusOverlap): Study locus overlap dataset to remove duplicated loci - - Returns: - DataFrame: Full set of positive and negative evidence of locus to gene associations. False negatives are removed. # TODO rename - """ - return ( - full_set.alias("left") - .join( - study_locus_overlap.df.select( - "leftStudyLocusId", "rightStudyLocusId" - ).alias("right"), - (f.col("left.variantId") == f.col("right.leftStudyLocusId")) - | (f.col("left.variantId") == f.col("right.rightStudyLocusId")), - how="left", - ) - .distinct() - ) - @classmethod def as_l2g_gold_standard( cls: type[OpenTargetsL2GGoldStandard], gold_standard_curation: DataFrame, v2g: V2G, - study_locus_overlap: StudyLocusOverlap, - interactions: DataFrame, ) -> L2GGoldStandard: """Initialise L2GGoldStandard from source dataset. Args: gold_standard_curation (DataFrame): Gold standard curation dataframe, extracted from https://github.com/opentargets/genetics-gold-standards v2g (V2G): Variant to gene dataset to bring distance between a variant and a gene's TSS - study_locus_overlap (StudyLocusOverlap): Study locus overlap dataset to remove duplicated loci - interactions (DataFrame): Gene-gene interactions dataset to remove negative cases where the gene interacts with a positive gene Returns: - L2GGoldStandard: L2G Gold Standard dataset + L2GGoldStandard: L2G Gold Standard dataset. False negatives have not yet been removed. """ - interactions_df = cls.process_gene_interactions(interactions) - - positive_set = cls.create_positive_set(gold_standard_curation) - - full_set = cls.create_full_set(positive_set, v2g) - - final_set = full_set.transform( - # TODO: move logic to L2GGoldStandard - cls.remove_redundant_locus, - study_locus_overlap, - ).transform(cls.remove_false_negatives, interactions_df) - return L2GGoldStandard( - _df=final_set, + _df=cls.parse_positive_curation(gold_standard_curation).transform( + cls.expand_gold_standard_with_negatives, v2g + ), _schema=L2GGoldStandard.get_schema(), ) From ab29c9a70281186c947f5ce93b4c8bdd4ea4bc96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Fri, 17 Nov 2023 12:18:12 +0000 Subject: [PATCH 07/25] test: add `test_parse_positive_curation` --- .../open_targets/test_l2g_gold_standard.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/datasource/open_targets/test_l2g_gold_standard.py b/tests/datasource/open_targets/test_l2g_gold_standard.py index d3435e624..bced217e0 100644 --- a/tests/datasource/open_targets/test_l2g_gold_standard.py +++ b/tests/datasource/open_targets/test_l2g_gold_standard.py @@ -9,23 +9,27 @@ from otg.datasource.open_targets.l2g_gold_standard import OpenTargetsL2GGoldStandard if TYPE_CHECKING: - from otg.dataset.study_locus_overlap import StudyLocusOverlap from otg.dataset.v2g import V2G def test_open_targets_as_l2g_gold_standard( sample_l2g_gold_standard: DataFrame, mock_v2g: V2G, - mock_study_locus_overlap: StudyLocusOverlap, - sample_otp_interactions: DataFrame, ) -> None: """Test L2G gold standard from OTG curation.""" assert isinstance( OpenTargetsL2GGoldStandard.as_l2g_gold_standard( sample_l2g_gold_standard, mock_v2g, - mock_study_locus_overlap, - sample_otp_interactions, ), L2GGoldStandard, ) + + +def test_parse_positive_curation( + sample_l2g_gold_standard: DataFrame, +) -> None: + """Test parsing curation as the positive set.""" + expected_cols = ["studyLocusId", "studyId", "variantId", "geneId", "sources"] + df = OpenTargetsL2GGoldStandard.parse_positive_curation(sample_l2g_gold_standard) + assert df.columns == expected_cols, "GS parsing has a different schema." From dd95d9c51fec65f754d702a67301925fa7d97827 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Fri, 17 Nov 2023 13:03:32 +0000 Subject: [PATCH 08/25] test: fix and test logic in `expand_gold_standard_with_negatives` --- .../open_targets/l2g_gold_standard.py | 45 +++++++++++++------ .../open_targets/test_l2g_gold_standard.py | 36 ++++++++++++++- 2 files changed, 66 insertions(+), 15 deletions(-) diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py index 5b47468f7..0611023e1 100644 --- a/src/otg/datasource/open_targets/l2g_gold_standard.py +++ b/src/otg/datasource/open_targets/l2g_gold_standard.py @@ -75,18 +75,32 @@ def expand_gold_standard_with_negatives( DataFrame: Full set of positive and negative evidence of locus to gene associations """ # TODO: test function - return positive_set.join( - v2g.df.filter(f.col("distance") <= cls.LOCUS_TO_GENE_WINDOW), - on="variantId", - how="left", - ).withColumn( - "goldStandardSet", - f.when( - (f.col("positives.geneId") == f.col("negatives.geneId")) - # to keep the positives that are outside the v2g dataset - | (f.col("negatives.geneId").isNull()), - f.lit(cls.GS_POSITIVE_LABEL), - ).otherwise(cls.GS_NEGATIVE_LABEL), + return ( + positive_set.withColumnRenamed("geneId", "curated_geneId") + .join( + v2g.df.selectExpr( + "variantId", "geneId as non_curated_geneId", "distance" + ).filter(f.col("distance") <= cls.LOCUS_TO_GENE_WINDOW), + on="variantId", + how="left", + ) + .withColumn( + "goldStandardSet", + f.when( + (f.col("curated_geneId") == f.col("non_curated_geneId")) + # to keep the positives that are outside the v2g dataset + | (f.col("non_curated_geneId").isNull()), + f.lit(cls.GS_POSITIVE_LABEL), + ).otherwise(cls.GS_NEGATIVE_LABEL), + ) + .withColumn( + "geneId", + f.when( + f.col("goldStandardSet") == cls.GS_POSITIVE_LABEL, + f.col("curated_geneId"), + ).otherwise(f.col("non_curated_geneId")), + ) + .drop("distance", "curated_geneId", "non_curated_geneId") ) @classmethod @@ -105,8 +119,11 @@ def as_l2g_gold_standard( L2GGoldStandard: L2G Gold Standard dataset. False negatives have not yet been removed. """ return L2GGoldStandard( - _df=cls.parse_positive_curation(gold_standard_curation).transform( - cls.expand_gold_standard_with_negatives, v2g + _df=cls.parse_positive_curation(gold_standard_curation) + .transform(cls.expand_gold_standard_with_negatives, v2g) + .drop( + "variantId", + "studyId", ), _schema=L2GGoldStandard.get_schema(), ) diff --git a/tests/datasource/open_targets/test_l2g_gold_standard.py b/tests/datasource/open_targets/test_l2g_gold_standard.py index bced217e0..7f075f865 100644 --- a/tests/datasource/open_targets/test_l2g_gold_standard.py +++ b/tests/datasource/open_targets/test_l2g_gold_standard.py @@ -6,10 +6,11 @@ from pyspark.sql import DataFrame from otg.dataset.l2g_gold_standard import L2GGoldStandard +from otg.dataset.v2g import V2G from otg.datasource.open_targets.l2g_gold_standard import OpenTargetsL2GGoldStandard if TYPE_CHECKING: - from otg.dataset.v2g import V2G + from pyspark.sql.session import SparkSession def test_open_targets_as_l2g_gold_standard( @@ -33,3 +34,36 @@ def test_parse_positive_curation( expected_cols = ["studyLocusId", "studyId", "variantId", "geneId", "sources"] df = OpenTargetsL2GGoldStandard.parse_positive_curation(sample_l2g_gold_standard) assert df.columns == expected_cols, "GS parsing has a different schema." + + +def test_expand_gold_standard_with_negatives(spark: SparkSession) -> None: + """Test expanding positive set with negative set.""" + sample_positive_set = spark.createDataFrame( + [ + ("variant1", "gene1", "study1"), + ("variant2", "gene2", "study1"), + ], + ["variantId", "geneId", "studyId"], + ) + sample_v2g_df = spark.createDataFrame( + [ + ("variant1", "gene1", 5, "X", "X", "X"), + ("variant1", "gene3", 10, "X", "X", "X"), + ], + ["variantId", "geneId", "distance", "chromosome", "datatypeId", "datasourceId"], + ) + + expected_expanded_gs = spark.createDataFrame( + [ + ("variant1", "study1", "negative", "gene3"), + ("variant1", "study1", "positive", "gene1"), + ("variant2", "study1", "positive", "gene2"), + ], + ["variantId", "geneId", "goldStandardSet", "studyId"], + ) + observed_df = OpenTargetsL2GGoldStandard.expand_gold_standard_with_negatives( + sample_positive_set, V2G(_df=sample_v2g_df, _schema=V2G.get_schema()) + ) + assert ( + observed_df.collect() == expected_expanded_gs.collect() + ), "GS expansion is not as expected." From 8347b2f896d5a808a57d924050cd7f498eb30a2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Fri, 17 Nov 2023 15:09:10 +0000 Subject: [PATCH 09/25] test: add `test_expand_gold_standard_with_negatives_same_positives` --- .../open_targets/test_l2g_gold_standard.py | 92 +++++++++++++------ 1 file changed, 63 insertions(+), 29 deletions(-) diff --git a/tests/datasource/open_targets/test_l2g_gold_standard.py b/tests/datasource/open_targets/test_l2g_gold_standard.py index 7f075f865..0ae3e0a08 100644 --- a/tests/datasource/open_targets/test_l2g_gold_standard.py +++ b/tests/datasource/open_targets/test_l2g_gold_standard.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING +import pytest from pyspark.sql import DataFrame from otg.dataset.l2g_gold_standard import L2GGoldStandard @@ -36,34 +37,67 @@ def test_parse_positive_curation( assert df.columns == expected_cols, "GS parsing has a different schema." -def test_expand_gold_standard_with_negatives(spark: SparkSession) -> None: +class TestExpandGoldStandardWithNegatives: """Test expanding positive set with negative set.""" - sample_positive_set = spark.createDataFrame( - [ - ("variant1", "gene1", "study1"), - ("variant2", "gene2", "study1"), - ], - ["variantId", "geneId", "studyId"], - ) - sample_v2g_df = spark.createDataFrame( - [ - ("variant1", "gene1", 5, "X", "X", "X"), - ("variant1", "gene3", 10, "X", "X", "X"), - ], - ["variantId", "geneId", "distance", "chromosome", "datatypeId", "datasourceId"], - ) - expected_expanded_gs = spark.createDataFrame( - [ - ("variant1", "study1", "negative", "gene3"), - ("variant1", "study1", "positive", "gene1"), - ("variant2", "study1", "positive", "gene2"), - ], - ["variantId", "geneId", "goldStandardSet", "studyId"], - ) - observed_df = OpenTargetsL2GGoldStandard.expand_gold_standard_with_negatives( - sample_positive_set, V2G(_df=sample_v2g_df, _schema=V2G.get_schema()) - ) - assert ( - observed_df.collect() == expected_expanded_gs.collect() - ), "GS expansion is not as expected." + observed_df: DataFrame + expected_expanded_gs: DataFrame + sample_positive_set: DataFrame + + def test_expand_gold_standard_with_negatives_logic( + self: TestExpandGoldStandardWithNegatives, spark: SparkSession + ) -> None: + """Test expanding positive set with negative set coincides with expected results.""" + assert ( + self.observed_df.collect() == self.expected_expanded_gs.collect() + ), "GS expansion is not as expected." + + def test_expand_gold_standard_with_negatives_same_positives( + self: TestExpandGoldStandardWithNegatives, spark: SparkSession + ) -> None: + """Test expanding positive set with negative set doesn't remove any positives.""" + assert ( + self.observed_df.filter("goldStandardSet == 'positive'").count() + == self.sample_positive_set.count() + ), "GS expansion has removed positives." + + @pytest.fixture(autouse=True) + def _setup(self: TestExpandGoldStandardWithNegatives, spark: SparkSession) -> None: + """Prepare fixtures for TestExpandGoldStandardWithNegatives.""" + self.sample_positive_set = spark.createDataFrame( + [ + ("variant1", "gene1", "study1"), + ("variant2", "gene2", "study1"), + ], + ["variantId", "geneId", "studyId"], + ) + + sample_v2g_df = spark.createDataFrame( + [ + ("variant1", "gene1", 5, "X", "X", "X"), + ("variant1", "gene3", 10, "X", "X", "X"), + ], + [ + "variantId", + "geneId", + "distance", + "chromosome", + "datatypeId", + "datasourceId", + ], + ) + + self.expected_expanded_gs = spark.createDataFrame( + [ + ("variant1", "study1", "negative", "gene3"), + ("variant1", "study1", "positive", "gene1"), + ("variant2", "study1", "positive", "gene2"), + ], + ["variantId", "geneId", "goldStandardSet", "studyId"], + ) + self.observed_df = ( + OpenTargetsL2GGoldStandard.expand_gold_standard_with_negatives( + self.sample_positive_set, + V2G(_df=sample_v2g_df, _schema=V2G.get_schema()), + ) + ) From ca94412df9a0ebc950b3b10cb0b32c403eacc0d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Fri, 17 Nov 2023 15:40:56 +0000 Subject: [PATCH 10/25] test: testing for `process_gene_interactions` --- src/otg/dataset/l2g_gold_standard.py | 31 +++++++++++++------ .../open_targets/l2g_gold_standard.py | 1 - tests/dataset/test_l2g.py | 16 +++++++++- 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/src/otg/dataset/l2g_gold_standard.py b/src/otg/dataset/l2g_gold_standard.py index b5f1189a9..2a68762a8 100644 --- a/src/otg/dataset/l2g_gold_standard.py +++ b/src/otg/dataset/l2g_gold_standard.py @@ -55,6 +55,15 @@ def from_otg_curation( .remove_false_negatives(interactions_df) ) + @classmethod + def get_schema(cls: type[L2GGoldStandard]) -> StructType: + """Provides the schema for the L2GGoldStandard dataset. + + Returns: + StructType: Spark schema for the L2GGoldStandard dataset + """ + return parse_spark_schema("l2g_gold_standard.json") + @classmethod def process_gene_interactions( cls: Type[L2GGoldStandard], interactions: DataFrame @@ -62,10 +71,21 @@ def process_gene_interactions( """Extract top scoring gene-gene interaction from the interactions dataset of the Platform. Args: - interactions (DataFrame): Gene-gene interactions dataset + interactions (DataFrame): Gene-gene interactions dataset from the Open Targets Platform Returns: DataFrame: Top scoring gene-gene interaction per pair of genes + + Examples: + >>> interactions = spark.createDataFrame([("gene1", "gene2", 0.8), ("gene1", "gene2", 0.5), ("gene2", "gene3", 0.7)], ["targetA", "targetB", "scoring"]) + >>> L2GGoldStandard.process_gene_interactions(interactions).show() + +-------+-------+-----+ + |geneIdA|geneIdB|score| + +-------+-------+-----+ + | gene1| gene2| 0.8| + | gene2| gene3| 0.7| + +-------+-------+-----+ + """ return get_record_with_maximum_value( interactions, @@ -77,15 +97,6 @@ def process_gene_interactions( "scoring as score", ) - @classmethod - def get_schema(cls: type[L2GGoldStandard]) -> StructType: - """Provides the schema for the L2GGoldStandard dataset. - - Returns: - StructType: Spark schema for the L2GGoldStandard dataset - """ - return parse_spark_schema("l2g_gold_standard.json") - def filter_unique_associations( self: L2GGoldStandard, study_locus_overlap: StudyLocusOverlap, diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py index 0611023e1..cc8f4b710 100644 --- a/src/otg/datasource/open_targets/l2g_gold_standard.py +++ b/src/otg/datasource/open_targets/l2g_gold_standard.py @@ -74,7 +74,6 @@ def expand_gold_standard_with_negatives( Returns: DataFrame: Full set of positive and negative evidence of locus to gene associations """ - # TODO: test function return ( positive_set.withColumnRenamed("geneId", "curated_geneId") .join( diff --git a/tests/dataset/test_l2g.py b/tests/dataset/test_l2g.py index 140a26c01..2ed278e07 100644 --- a/tests/dataset/test_l2g.py +++ b/tests/dataset/test_l2g.py @@ -1,10 +1,15 @@ -"""Tests on LD index.""" +"""Tests on L2G datasets.""" from __future__ import annotations +from typing import TYPE_CHECKING + from otg.dataset.l2g_feature_matrix import L2GFeatureMatrix from otg.dataset.l2g_gold_standard import L2GGoldStandard from otg.dataset.l2g_prediction import L2GPrediction +if TYPE_CHECKING: + from pyspark.sql import DataFrame + def test_feature_matrix(mock_l2g_feature_matrix: L2GFeatureMatrix) -> None: """Test L2G Feature Matrix creation with mock data.""" @@ -16,6 +21,15 @@ def test_gold_standard(mock_l2g_gold_standard: L2GFeatureMatrix) -> None: assert isinstance(mock_l2g_gold_standard, L2GGoldStandard) +def test_process_gene_interactions(sample_otp_interactions: DataFrame) -> None: + """Tests processing of gene interactions from OTP.""" + expected_cols = ["geneIdA", "geneIdB", "score"] + observed_df = L2GGoldStandard.process_gene_interactions(sample_otp_interactions) + assert ( + observed_df.columns == expected_cols + ), "Gene interactions has a different schema." + + def test_predictions(mock_l2g_predictions: L2GFeatureMatrix) -> None: """Test L2G predictions creation with mock data.""" assert isinstance(mock_l2g_predictions, L2GPrediction) From 6a339761e80361ee3aa5a8ae1fb40bf785b665cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Mon, 20 Nov 2023 12:32:43 +0000 Subject: [PATCH 11/25] chore: add `variantId` to gold standards schema --- src/otg/assets/schemas/l2g_gold_standard.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/otg/assets/schemas/l2g_gold_standard.json b/src/otg/assets/schemas/l2g_gold_standard.json index 98e3e906b..d9d79ce09 100644 --- a/src/otg/assets/schemas/l2g_gold_standard.json +++ b/src/otg/assets/schemas/l2g_gold_standard.json @@ -7,6 +7,12 @@ "nullable": false, "metadata": {} }, + { + "name": "variantId", + "type": "string", + "nullable": false, + "metadata": {} + }, { "name": "geneId", "type": "string", From c75a6634bf55113c3bff15fbab2d169778c07eb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Mon, 20 Nov 2023 12:34:37 +0000 Subject: [PATCH 12/25] chore: change `sources` in gold standards schema to a nullable --- src/otg/assets/schemas/l2g_gold_standard.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/otg/assets/schemas/l2g_gold_standard.json b/src/otg/assets/schemas/l2g_gold_standard.json index d9d79ce09..ba494b9aa 100644 --- a/src/otg/assets/schemas/l2g_gold_standard.json +++ b/src/otg/assets/schemas/l2g_gold_standard.json @@ -28,7 +28,7 @@ { "metadata": {}, "name": "sources", - "nullable": false, + "nullable": true, "type": { "containsNull": true, "elementType": "string", From 80077267bac0b948e8618581bc3dfa10b39388cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Mon, 20 Nov 2023 13:01:02 +0000 Subject: [PATCH 13/25] test: add `test_filter_unique_associations` --- src/otg/dataset/l2g_gold_standard.py | 3 +- tests/dataset/test_l2g.py | 56 +++++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/src/otg/dataset/l2g_gold_standard.py b/src/otg/dataset/l2g_gold_standard.py index 2a68762a8..59f437606 100644 --- a/src/otg/dataset/l2g_gold_standard.py +++ b/src/otg/dataset/l2g_gold_standard.py @@ -109,7 +109,7 @@ def filter_unique_associations( Returns: L2GGoldStandard: L2GGoldStandard updated to exclude false negatives and redundant positives. """ - # TODO: Test this logic + cols_to_keep = self.df.columns self.df = ( self.df.alias("left") .join( @@ -120,6 +120,7 @@ def filter_unique_associations( | (f.col("left.variantId") == f.col("right.rightStudyLocusId")), how="left", ) + .select(*cols_to_keep) .distinct() ) return self diff --git a/tests/dataset/test_l2g.py b/tests/dataset/test_l2g.py index 2ed278e07..fc93f6cad 100644 --- a/tests/dataset/test_l2g.py +++ b/tests/dataset/test_l2g.py @@ -6,9 +6,10 @@ from otg.dataset.l2g_feature_matrix import L2GFeatureMatrix from otg.dataset.l2g_gold_standard import L2GGoldStandard from otg.dataset.l2g_prediction import L2GPrediction +from otg.dataset.study_locus_overlap import StudyLocusOverlap if TYPE_CHECKING: - from pyspark.sql import DataFrame + from pyspark.sql import DataFrame, SparkSession def test_feature_matrix(mock_l2g_feature_matrix: L2GFeatureMatrix) -> None: @@ -33,3 +34,56 @@ def test_process_gene_interactions(sample_otp_interactions: DataFrame) -> None: def test_predictions(mock_l2g_predictions: L2GFeatureMatrix) -> None: """Test L2G predictions creation with mock data.""" assert isinstance(mock_l2g_predictions, L2GPrediction) + + +def test_filter_unique_associations(spark: SparkSession) -> None: + """Test filter_unique_associations.""" + mock_l2g_gs_df = spark.createDataFrame( + [ + (1, "variant1", "gene1", "positive"), + ( + 2, + "variant2", + "gene1", + "negative", + ), # in the same locus as sl1 and pointing to same gene, has to be dropped + ( + 3, + "variant3", + "gene1", + "positive", + ), # in diff locus as sl1 and pointing to same gene, has to be kept + ( + 4, + "variant4", + "gene2", + "positive", + ), # in same locus as sl1 and pointing to diff gene, has to be kept + ], + "studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING", + ) + + mock_sl_overlap_df = spark.createDataFrame( + [(1, 2, "variant2"), (1, 4, "variant4")], + "leftStudyLocusId LONG, rightStudyLocusId LONG, tagVariantId STRING", + ) + + expected_df = spark.createDataFrame( + [ + (1, "variant1", "gene1", "positive"), + (3, "variant3", "gene1", "positive"), + (4, "variant4", "gene2", "positive"), + ], + "studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING", + ) + + mock_l2g_gs = L2GGoldStandard( + _df=mock_l2g_gs_df, _schema=L2GGoldStandard.get_schema() + ) + mock_sl_overlap = StudyLocusOverlap( + _df=mock_sl_overlap_df, _schema=StudyLocusOverlap.get_schema() + ) + + observed_df = mock_l2g_gs.filter_unique_associations(mock_sl_overlap).df + + assert observed_df.collect() == expected_df.collect() From 9c0a042306f0e56ac42d1c67f354e65c6c252500 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Mon, 20 Nov 2023 17:01:19 +0000 Subject: [PATCH 14/25] feat(overlaps): add and test method to transform the overlaps as a square matrix --- src/otg/dataset/study_locus_overlap.py | 17 +++++++++++++++ tests/dataset/test_study_locus_overlap.py | 26 +++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/src/otg/dataset/study_locus_overlap.py b/src/otg/dataset/study_locus_overlap.py index 5902f613f..d0730d723 100644 --- a/src/otg/dataset/study_locus_overlap.py +++ b/src/otg/dataset/study_locus_overlap.py @@ -47,3 +47,20 @@ def from_associations( StudyLocusOverlap: Study-locus overlap dataset """ return study_locus.find_overlaps(study_index) + + def _convert_to_square_matrix(self: StudyLocusOverlap) -> StudyLocusOverlap: + """Convert the dataset to a square matrix. + + Returns: + StudyLocusOverlap: Square matrix of the dataset + """ + return StudyLocusOverlap( + _df=self.df.unionByName( + self.df.selectExpr( + "leftStudyLocusId as rightStudyLocusId", + "rightStudyLocusId as leftStudyLocusId", + "tagVariantId", + ) + ).distinct(), + _schema=self.get_schema(), + ) diff --git a/tests/dataset/test_study_locus_overlap.py b/tests/dataset/test_study_locus_overlap.py index b16311d6c..23c3a4e65 100644 --- a/tests/dataset/test_study_locus_overlap.py +++ b/tests/dataset/test_study_locus_overlap.py @@ -1,6 +1,8 @@ """Test study locus overlap dataset.""" from __future__ import annotations +from pyspark.sql import SparkSession + from otg.dataset.study_locus_overlap import StudyLocusOverlap @@ -9,3 +11,27 @@ def test_study_locus_overlap_creation( ) -> None: """Test study locus overlap creation with mock data.""" assert isinstance(mock_study_locus_overlap, StudyLocusOverlap) + + +def test_convert_to_square_matrix(spark: SparkSession) -> None: + """Test _convert_to_square_matrix.""" + mock_sl_overlap = StudyLocusOverlap( + _df=spark.createDataFrame( + [ + (1, 2, "variant2"), + ], + "leftStudyLocusId LONG, rightStudyLocusId LONG, tagVariantId STRING", + ), + _schema=StudyLocusOverlap.get_schema(), + ) + + expected_df = spark.createDataFrame( + [ + (1, 2, "variant2"), + (2, 1, "variant2"), + ], + "leftStudyLocusId LONG, rightStudyLocusId LONG, tagVariantId STRING", + ) + observed_df = mock_sl_overlap._convert_to_square_matrix().df + + assert observed_df.collect() == expected_df.collect() From dc7c423d6f668f7c8b8d4a072a85058beaf21a76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Mon, 20 Nov 2023 17:02:08 +0000 Subject: [PATCH 15/25] chore(overlaps): chromosome and statistics are not mandatory fields in the schema --- src/otg/assets/schemas/study_locus_overlap.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/otg/assets/schemas/study_locus_overlap.json b/src/otg/assets/schemas/study_locus_overlap.json index 962a1186d..103321f79 100644 --- a/src/otg/assets/schemas/study_locus_overlap.json +++ b/src/otg/assets/schemas/study_locus_overlap.json @@ -15,7 +15,7 @@ { "metadata": {}, "name": "chromosome", - "nullable": false, + "nullable": true, "type": "string" }, { @@ -27,7 +27,7 @@ { "metadata": {}, "name": "statistics", - "nullable": false, + "nullable": true, "type": { "fields": [ { From 28031b8a14dfad0ef91d7160d138567a7ead0e6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Mon, 20 Nov 2023 17:05:35 +0000 Subject: [PATCH 16/25] feat(l2g_gold_standard): change `filter_unique_associations` logic --- src/otg/dataset/l2g_gold_standard.py | 38 ++++++++++++++++++++-------- tests/dataset/test_l2g.py | 2 +- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/src/otg/dataset/l2g_gold_standard.py b/src/otg/dataset/l2g_gold_standard.py index 59f437606..f838f0173 100644 --- a/src/otg/dataset/l2g_gold_standard.py +++ b/src/otg/dataset/l2g_gold_standard.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Type import pyspark.sql.functions as f +from pyspark.sql import Window from otg.common.schemas import parse_spark_schema from otg.common.spark_helpers import get_record_with_maximum_value @@ -101,7 +102,12 @@ def filter_unique_associations( self: L2GGoldStandard, study_locus_overlap: StudyLocusOverlap, ) -> L2GGoldStandard: - """Refines the gold standard to filter out loci that are not independent. redundant loci by testing they are truly independent. + """Refines the gold standard to filter out loci that are not independent. + + Rules: + - If two loci point to the same gene, one positive and one negative, and have overlapping variants, we keep the positive one. + - If two loci point to the same gene, both positive or negative, and have overlapping variants, we drop one. + - If two loci point to different genes, and have overlapping variants, we keep both. Args: study_locus_overlap (StudyLocusOverlap): A dataset detailing variants that overlap between StudyLocus. @@ -109,21 +115,33 @@ def filter_unique_associations( Returns: L2GGoldStandard: L2GGoldStandard updated to exclude false negatives and redundant positives. """ + squared_overlaps = study_locus_overlap._convert_to_square_matrix() cols_to_keep = self.df.columns - self.df = ( + unique_associations = ( self.df.alias("left") + # identify all the study loci that point to the same gene + .withColumn( + "sl_same_gene", + f.collect_set("studyLocusId").over(Window.partitionBy("geneId")), + ) + # identify all the study loci that have an overlapping variant .join( - study_locus_overlap.df.select( - "leftStudyLocusId", "rightStudyLocusId" - ).alias("right"), - (f.col("left.variantId") == f.col("right.leftStudyLocusId")) - | (f.col("left.variantId") == f.col("right.rightStudyLocusId")), - how="left", + squared_overlaps.df.alias("right"), + (f.col("left.studyLocusId") == f.col("right.leftStudyLocusId")) + & (f.col("left.variantId") == f.col("right.tagVariantId")), + "left", + ) + .withColumn( + "overlaps", + f.when(f.col("right.tagVariantId").isNotNull(), f.lit(True)).otherwise( + f.lit(False) + ), ) + # drop redundant rows: where the variantid overlaps and the gene is "explained" by more than one study locus + .filter(~((f.size("sl_same_gene") > 1) & (f.col("overlaps") == 1))) .select(*cols_to_keep) - .distinct() ) - return self + return L2GGoldStandard(_df=unique_associations, _schema=self.get_schema()) def remove_false_negatives( self: L2GGoldStandard, diff --git a/tests/dataset/test_l2g.py b/tests/dataset/test_l2g.py index fc93f6cad..3ca74ecab 100644 --- a/tests/dataset/test_l2g.py +++ b/tests/dataset/test_l2g.py @@ -82,7 +82,7 @@ def test_filter_unique_associations(spark: SparkSession) -> None: ) mock_sl_overlap = StudyLocusOverlap( _df=mock_sl_overlap_df, _schema=StudyLocusOverlap.get_schema() - ) + )._convert_to_square_matrix() observed_df = mock_l2g_gs.filter_unique_associations(mock_sl_overlap).df From aa4246ccea29023e7022663eaec338eeb7a57f18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Mon, 20 Nov 2023 17:23:32 +0000 Subject: [PATCH 17/25] test(l2g_gold_standard): add `test_remove_false_negatives` --- tests/dataset/test_l2g.py | 54 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tests/dataset/test_l2g.py b/tests/dataset/test_l2g.py index 3ca74ecab..12c80d874 100644 --- a/tests/dataset/test_l2g.py +++ b/tests/dataset/test_l2g.py @@ -87,3 +87,57 @@ def test_filter_unique_associations(spark: SparkSession) -> None: observed_df = mock_l2g_gs.filter_unique_associations(mock_sl_overlap).df assert observed_df.collect() == expected_df.collect() + + +def test_remove_false_negatives(spark: SparkSession) -> None: + """Test `remove_false_negatives`.""" + mock_l2g_gs_df = spark.createDataFrame( + [ + (1, "variant1", "gene1", "positive"), + ( + 2, + "variant2", + "gene2", + "negative", + ), # gene2 is a partner of gene1, has to be dropped + ( + 3, + "variant3", + "gene3", + "negative", + ), # gene 3 is not a partner of gene1, has to be kept + ( + 4, + "variant4", + "gene4", + "positive", + ), # gene 4 is a partner of gene1, has to be kept because it's positive + ], + "studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING", + ) + + mock_interactions_df = spark.createDataFrame( + [ + ("gene1", "gene2", 0.8), + ("gene1", "gene3", 0.5), + ("gene1", "gene4", 0.8), + ], + "geneIdA STRING, geneIdB STRING, score DOUBLE", + ) + + expected_df = spark.createDataFrame( + [ + (1, "variant1", "gene1", "positive"), + (3, "variant3", "gene3", "negative"), + (4, "variant4", "gene4", "positive"), + ], + "studyLocusId LONG, variantId STRING, geneId STRING, goldStandardSet STRING", + ) + + mock_l2g_gs = L2GGoldStandard( + _df=mock_l2g_gs_df, _schema=L2GGoldStandard.get_schema() + ) + + observed_df = mock_l2g_gs.remove_false_negatives(mock_interactions_df).df + + assert observed_df.collect() == expected_df.collect() From 0a1ffa0b75f9034ed02a6a333969d037e3e171c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Tue, 21 Nov 2023 00:28:25 +0000 Subject: [PATCH 18/25] fix(l2g_gold_standard): fix logic in `remove_false_negatives` --- src/otg/dataset/l2g_gold_standard.py | 57 ++++++++++++------- .../open_targets/l2g_gold_standard.py | 1 - tests/dataset/test_l2g.py | 4 +- 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/otg/dataset/l2g_gold_standard.py b/src/otg/dataset/l2g_gold_standard.py index f838f0173..65ba5859b 100644 --- a/src/otg/dataset/l2g_gold_standard.py +++ b/src/otg/dataset/l2g_gold_standard.py @@ -116,7 +116,6 @@ def filter_unique_associations( L2GGoldStandard: L2GGoldStandard updated to exclude false negatives and redundant positives. """ squared_overlaps = study_locus_overlap._convert_to_square_matrix() - cols_to_keep = self.df.columns unique_associations = ( self.df.alias("left") # identify all the study loci that point to the same gene @@ -139,7 +138,7 @@ def filter_unique_associations( ) # drop redundant rows: where the variantid overlaps and the gene is "explained" by more than one study locus .filter(~((f.size("sl_same_gene") > 1) & (f.col("overlaps") == 1))) - .select(*cols_to_keep) + .select(*self.df.columns) ) return L2GGoldStandard(_df=unique_associations, _schema=self.get_schema()) @@ -155,30 +154,44 @@ def remove_false_negatives( Returns: L2GGoldStandard: A refined set of locus-to-gene associations with increased reliability, having excluded loci that were likely false negatives due to gene-gene interaction confounding. """ - # TODO: Test this logic - self.df = ( + squared_interactions = interactions_df.unionByName( + interactions_df.selectExpr( + "geneIdB as geneIdA", "geneIdA as geneIdB", "score" + ) + ).filter(f.col("score") > self.INTERACTION_THRESHOLD) + df = ( self.df.alias("left") .join( - interactions_df.alias("interactions"), - (f.col("left.geneId") == f.col("interactions.geneIdA")) - | (f.col("left.geneId") == f.col("interactions.geneIdB")), - how="left", + # bring gene partners + squared_interactions.alias("right"), + f.col("left.geneId") == f.col("right.geneIdA"), + "left", ) - .withColumn( - "interacting", - (f.col("score") > self.INTERACTION_THRESHOLD), + .withColumnRenamed("geneIdB", "interactorGeneId") + .join( + # bring gold standard status for gene partners + self.df.selectExpr( + "geneId as interactorGeneId", + "goldStandardSet as interactorGeneIdGoldStandardSet", + ), + "interactorGeneId", + "left", + ) + # remove self-interactions + .filter( + (f.col("geneId") != f.col("interactorGeneId")) + | (f.col("interactorGeneId").isNull()) ) + # remove false negatives .filter( - ~( - ( - f.col("goldStandardSet") == 0 - ) # TODO: goldStandardSet is a string, not an int - & (f.col("interacting")) - & ( - (f.col("left.geneId") == f.col("interactions.geneIdA")) - | (f.col("left.geneId") == f.col("interactions.geneIdB")) - ) - ) + # drop rows where the GS gene is negative but the interactor is a GS positive + ~(f.col("goldStandardSet") == "negative") + & (f.col("interactorGeneIdGoldStandardSet") == "positive") + | + # keep rows where the gene does not interact + (f.col("interactorGeneId").isNull()) ) + .select(*self.df.columns) + .distinct() ) - return self + return L2GGoldStandard(_df=df, _schema=self.get_schema()) diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py index cc8f4b710..0d95ecbe8 100644 --- a/src/otg/datasource/open_targets/l2g_gold_standard.py +++ b/src/otg/datasource/open_targets/l2g_gold_standard.py @@ -121,7 +121,6 @@ def as_l2g_gold_standard( _df=cls.parse_positive_curation(gold_standard_curation) .transform(cls.expand_gold_standard_with_negatives, v2g) .drop( - "variantId", "studyId", ), _schema=L2GGoldStandard.get_schema(), diff --git a/tests/dataset/test_l2g.py b/tests/dataset/test_l2g.py index 12c80d874..eb42d01c9 100644 --- a/tests/dataset/test_l2g.py +++ b/tests/dataset/test_l2g.py @@ -138,6 +138,8 @@ def test_remove_false_negatives(spark: SparkSession) -> None: _df=mock_l2g_gs_df, _schema=L2GGoldStandard.get_schema() ) - observed_df = mock_l2g_gs.remove_false_negatives(mock_interactions_df).df + observed_df = mock_l2g_gs.remove_false_negatives(mock_interactions_df).df.orderBy( + "studyLocusId" + ) assert observed_df.collect() == expected_df.collect() From 2f13b3b76015f610a2a6d2b6cf5b5bf715da6dfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Tue, 21 Nov 2023 00:39:28 +0000 Subject: [PATCH 19/25] chore(gold_standards): define gs labels as `L2GGoldStandard` attributes --- src/otg/dataset/l2g_gold_standard.py | 2 ++ src/otg/datasource/open_targets/l2g_gold_standard.py | 8 +++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/otg/dataset/l2g_gold_standard.py b/src/otg/dataset/l2g_gold_standard.py index 65ba5859b..c6c0b89b9 100644 --- a/src/otg/dataset/l2g_gold_standard.py +++ b/src/otg/dataset/l2g_gold_standard.py @@ -24,6 +24,8 @@ class L2GGoldStandard(Dataset): """L2G gold standard dataset.""" INTERACTION_THRESHOLD = 0.7 + GS_POSITIVE_LABEL = "positive" + GS_NEGATIVE_LABEL = "negative" @classmethod def from_otg_curation( diff --git a/src/otg/datasource/open_targets/l2g_gold_standard.py b/src/otg/datasource/open_targets/l2g_gold_standard.py index 0d95ecbe8..532e382fe 100644 --- a/src/otg/datasource/open_targets/l2g_gold_standard.py +++ b/src/otg/datasource/open_targets/l2g_gold_standard.py @@ -20,8 +20,6 @@ class OpenTargetsL2GGoldStandard: """ LOCUS_TO_GENE_WINDOW = 500_000 - GS_POSITIVE_LABEL = "positive" - GS_NEGATIVE_LABEL = "negative" @classmethod def parse_positive_curation( @@ -89,13 +87,13 @@ def expand_gold_standard_with_negatives( (f.col("curated_geneId") == f.col("non_curated_geneId")) # to keep the positives that are outside the v2g dataset | (f.col("non_curated_geneId").isNull()), - f.lit(cls.GS_POSITIVE_LABEL), - ).otherwise(cls.GS_NEGATIVE_LABEL), + f.lit(L2GGoldStandard.GS_POSITIVE_LABEL), + ).otherwise(L2GGoldStandard.GS_NEGATIVE_LABEL), ) .withColumn( "geneId", f.when( - f.col("goldStandardSet") == cls.GS_POSITIVE_LABEL, + f.col("goldStandardSet") == L2GGoldStandard.GS_POSITIVE_LABEL, f.col("curated_geneId"), ).otherwise(f.col("non_curated_geneId")), ) From 5f7d928451beaf8734442ac882df752fa40e8749 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Nov 2023 09:42:56 +0000 Subject: [PATCH 20/25] build(deps): bump pyarrow from 11.0.0 to 14.0.1 Bumps [pyarrow](https://github.com/apache/arrow) from 11.0.0 to 14.0.1. - [Commits](https://github.com/apache/arrow/compare/go/v11.0.0...go/v14.0.1) --- updated-dependencies: - dependency-name: pyarrow dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- poetry.lock | 67 +++++++++++++++++++++++++++++--------------------- pyproject.toml | 2 +- 2 files changed, 40 insertions(+), 29 deletions(-) diff --git a/poetry.lock b/poetry.lock index e11bdf074..32f676f53 100644 --- a/poetry.lock +++ b/poetry.lock @@ -6010,36 +6010,47 @@ files = [ [[package]] name = "pyarrow" -version = "11.0.0" +version = "14.0.1" description = "Python library for Apache Arrow" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "pyarrow-11.0.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:40bb42afa1053c35c749befbe72f6429b7b5f45710e85059cdd534553ebcf4f2"}, - {file = "pyarrow-11.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7c28b5f248e08dea3b3e0c828b91945f431f4202f1a9fe84d1012a761324e1ba"}, - {file = "pyarrow-11.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a37bc81f6c9435da3c9c1e767324ac3064ffbe110c4e460660c43e144be4ed85"}, - {file = "pyarrow-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad7c53def8dbbc810282ad308cc46a523ec81e653e60a91c609c2233ae407689"}, - {file = "pyarrow-11.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:25aa11c443b934078bfd60ed63e4e2d42461682b5ac10f67275ea21e60e6042c"}, - {file = "pyarrow-11.0.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:e217d001e6389b20a6759392a5ec49d670757af80101ee6b5f2c8ff0172e02ca"}, - {file = "pyarrow-11.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ad42bb24fc44c48f74f0d8c72a9af16ba9a01a2ccda5739a517aa860fa7e3d56"}, - {file = "pyarrow-11.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d942c690ff24a08b07cb3df818f542a90e4d359381fbff71b8f2aea5bf58841"}, - {file = "pyarrow-11.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f010ce497ca1b0f17a8243df3048055c0d18dcadbcc70895d5baf8921f753de5"}, - {file = "pyarrow-11.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:2f51dc7ca940fdf17893227edb46b6784d37522ce08d21afc56466898cb213b2"}, - {file = "pyarrow-11.0.0-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:1cbcfcbb0e74b4d94f0b7dde447b835a01bc1d16510edb8bb7d6224b9bf5bafc"}, - {file = "pyarrow-11.0.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aaee8f79d2a120bf3e032d6d64ad20b3af6f56241b0ffc38d201aebfee879d00"}, - {file = "pyarrow-11.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:410624da0708c37e6a27eba321a72f29d277091c8f8d23f72c92bada4092eb5e"}, - {file = "pyarrow-11.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:2d53ba72917fdb71e3584ffc23ee4fcc487218f8ff29dd6df3a34c5c48fe8c06"}, - {file = "pyarrow-11.0.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:f12932e5a6feb5c58192209af1d2607d488cb1d404fbc038ac12ada60327fa34"}, - {file = "pyarrow-11.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:41a1451dd895c0b2964b83d91019e46f15b5564c7ecd5dcb812dadd3f05acc97"}, - {file = "pyarrow-11.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:becc2344be80e5dce4e1b80b7c650d2fc2061b9eb339045035a1baa34d5b8f1c"}, - {file = "pyarrow-11.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f40be0d7381112a398b93c45a7e69f60261e7b0269cc324e9f739ce272f4f70"}, - {file = "pyarrow-11.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:362a7c881b32dc6b0eccf83411a97acba2774c10edcec715ccaab5ebf3bb0835"}, - {file = "pyarrow-11.0.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:ccbf29a0dadfcdd97632b4f7cca20a966bb552853ba254e874c66934931b9841"}, - {file = "pyarrow-11.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3e99be85973592051e46412accea31828da324531a060bd4585046a74ba45854"}, - {file = "pyarrow-11.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69309be84dcc36422574d19c7d3a30a7ea43804f12552356d1ab2a82a713c418"}, - {file = "pyarrow-11.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da93340fbf6f4e2a62815064383605b7ffa3e9eeb320ec839995b1660d69f89b"}, - {file = "pyarrow-11.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:caad867121f182d0d3e1a0d36f197df604655d0b466f1bc9bafa903aa95083e4"}, - {file = "pyarrow-11.0.0.tar.gz", hash = "sha256:5461c57dbdb211a632a48facb9b39bbeb8a7905ec95d768078525283caef5f6d"}, + {file = "pyarrow-14.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:96d64e5ba7dceb519a955e5eeb5c9adcfd63f73a56aea4722e2cc81364fc567a"}, + {file = "pyarrow-14.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a8ae88c0038d1bc362a682320112ee6774f006134cd5afc291591ee4bc06505"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f6f053cb66dc24091f5511e5920e45c83107f954a21032feadc7b9e3a8e7851"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:906b0dc25f2be12e95975722f1e60e162437023f490dbd80d0deb7375baf3171"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:78d4a77a46a7de9388b653af1c4ce539350726cd9af62e0831e4f2bd0c95a2f4"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:06ca79080ef89d6529bb8e5074d4b4f6086143b2520494fcb7cf8a99079cde93"}, + {file = "pyarrow-14.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:32542164d905002c42dff896efdac79b3bdd7291b1b74aa292fac8450d0e4dcd"}, + {file = "pyarrow-14.0.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:c7331b4ed3401b7ee56f22c980608cf273f0380f77d0f73dd3c185f78f5a6220"}, + {file = "pyarrow-14.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:922e8b49b88da8633d6cac0e1b5a690311b6758d6f5d7c2be71acb0f1e14cd61"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58c889851ca33f992ea916b48b8540735055201b177cb0dcf0596a495a667b00"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30d8494870d9916bb53b2a4384948491444741cb9a38253c590e21f836b01222"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:be28e1a07f20391bb0b15ea03dcac3aade29fc773c5eb4bee2838e9b2cdde0cb"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:981670b4ce0110d8dcb3246410a4aabf5714db5d8ea63b15686bce1c914b1f83"}, + {file = "pyarrow-14.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:4756a2b373a28f6166c42711240643fb8bd6322467e9aacabd26b488fa41ec23"}, + {file = "pyarrow-14.0.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:cf87e2cec65dd5cf1aa4aba918d523ef56ef95597b545bbaad01e6433851aa10"}, + {file = "pyarrow-14.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:470ae0194fbfdfbf4a6b65b4f9e0f6e1fa0ea5b90c1ee6b65b38aecee53508c8"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6263cffd0c3721c1e348062997babdf0151301f7353010c9c9a8ed47448f82ab"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8089d7e77d1455d529dbd7cff08898bbb2666ee48bc4085203af1d826a33cc"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:fada8396bc739d958d0b81d291cfd201126ed5e7913cb73de6bc606befc30226"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:2a145dab9ed7849fc1101bf03bcdc69913547f10513fdf70fc3ab6c0a50c7eee"}, + {file = "pyarrow-14.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:05fe7994745b634c5fb16ce5717e39a1ac1fac3e2b0795232841660aa76647cd"}, + {file = "pyarrow-14.0.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:a8eeef015ae69d104c4c3117a6011e7e3ecd1abec79dc87fd2fac6e442f666ee"}, + {file = "pyarrow-14.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3c76807540989fe8fcd02285dd15e4f2a3da0b09d27781abec3adc265ddbeba1"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:450e4605e3c20e558485f9161a79280a61c55efe585d51513c014de9ae8d393f"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:323cbe60210173ffd7db78bfd50b80bdd792c4c9daca8843ef3cd70b186649db"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0140c7e2b740e08c5a459439d87acd26b747fc408bde0a8806096ee0baaa0c15"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:e592e482edd9f1ab32f18cd6a716c45b2c0f2403dc2af782f4e9674952e6dd27"}, + {file = "pyarrow-14.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:d264ad13605b61959f2ae7c1d25b1a5b8505b112715c961418c8396433f213ad"}, + {file = "pyarrow-14.0.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:01e44de9749cddc486169cb632f3c99962318e9dacac7778315a110f4bf8a450"}, + {file = "pyarrow-14.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d0351fecf0e26e152542bc164c22ea2a8e8c682726fce160ce4d459ea802d69c"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c1f6110c386464fd2e5e4ea3624466055bbe681ff185fd6c9daa98f30a3f9a"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11e045dfa09855b6d3e7705a37c42e2dc2c71d608fab34d3c23df2e02df9aec3"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:097828b55321897db0e1dbfc606e3ff8101ae5725673498cbfa7754ee0da80e4"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1daab52050a1c48506c029e6fa0944a7b2436334d7e44221c16f6f1b2cc9c510"}, + {file = "pyarrow-14.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:3f6d5faf4f1b0d5a7f97be987cf9e9f8cd39902611e818fe134588ee99bf0283"}, + {file = "pyarrow-14.0.1.tar.gz", hash = "sha256:b8b3f4fe8d4ec15e1ef9b599b94683c5216adaed78d5cb4c606180546d1e2ee1"}, ] [package.dependencies] @@ -8391,4 +8402,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "3.10.8" -content-hash = "55087f647f35a1d78c6384fe55eaa084aad18debff119f164f4f14707e7465fc" +content-hash = "8197d06ec721972d642dd466b4de4fed93c9b14a279749ee7eb4f91857f5fce3" diff --git a/pyproject.toml b/pyproject.toml index 38e91874a..6ca883855 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ xgboost = "^1.7.3" scikit-learn = "^1.2.1" numpy = "^1.26.1" hail = "0.2.126" -pyarrow = "^11.0.0" +pyarrow = "^14.0.1" wandb = "^0.16.0" [tool.poetry.dev-dependencies] From 4e4e4f5d45af2574d4e4af38d0924896166182da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Mon, 27 Nov 2023 10:46:29 +0100 Subject: [PATCH 21/25] chore: rename study_locus to credible_set for l2g --- config/step/locus_to_gene.yaml | 2 +- src/otg/dataset/l2g_prediction.py | 2 +- src/otg/l2g.py | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/config/step/locus_to_gene.yaml b/config/step/locus_to_gene.yaml index c05301049..dd4e018fb 100644 --- a/config/step/locus_to_gene.yaml +++ b/config/step/locus_to_gene.yaml @@ -8,7 +8,7 @@ wandb_run_name: null perform_cross_validation: false model_path: ${datasets.l2g_model} predictions_path: ${datasets.l2g_predictions} -study_locus_path: ${datasets.credible_set} +credible_set_path: ${datasets.credible_set} variant_gene_path: ${datasets.v2g} colocalisation_path: ${datasets.colocalisation} study_index_path: ${datasets.catalog_study_index} diff --git a/src/otg/dataset/l2g_prediction.py b/src/otg/dataset/l2g_prediction.py index ce4e34144..a588818cd 100644 --- a/src/otg/dataset/l2g_prediction.py +++ b/src/otg/dataset/l2g_prediction.py @@ -41,7 +41,7 @@ def get_schema(cls: type[L2GPrediction]) -> StructType: return parse_spark_schema("l2g_predictions.json") @classmethod - def from_study_locus( + def from_credible_set( cls: Type[L2GPrediction], model_path: str, study_locus: StudyLocus, diff --git a/src/otg/l2g.py b/src/otg/l2g.py index df0f886e9..95d477dc6 100644 --- a/src/otg/l2g.py +++ b/src/otg/l2g.py @@ -33,7 +33,7 @@ class LocusToGeneStep: perform_cross_validation (bool): Whether to perform cross validation. model_path (str | None): Path to save the model. predictions_path (str | None): Path to save the predictions. - study_locus_path (str): Path to study locus Parquet files. + credible_set_path (str): Path to credible set Parquet files. variant_gene_path (str): Path to variant to gene Parquet files. colocalisation_path (str): Path to colocalisation Parquet files. study_index_path (str): Path to study index Parquet files. @@ -52,7 +52,7 @@ class LocusToGeneStep: perform_cross_validation: bool = False model_path: str = MISSING predictions_path: str = MISSING - study_locus_path: str = MISSING + credible_set_path: str = MISSING variant_gene_path: str = MISSING colocalisation_path: str = MISSING study_index_path: str = MISSING @@ -109,8 +109,8 @@ def __post_init__(self: LocusToGeneStep) -> None: f"run_mode must be one of 'train' or 'predict', got {self.run_mode}" ) # Load common inputs - study_locus = StudyLocus.from_parquet( - self.session, self.study_locus_path, recursiveFileLookup=True + credible_set = StudyLocus.from_parquet( + self.session, self.credible_set_path, recursiveFileLookup=True ) studies = StudyIndex.from_parquet(self.session, self.study_index_path) v2g = V2G.from_parquet(self.session, self.variant_gene_path) @@ -132,7 +132,7 @@ def __post_init__(self: LocusToGeneStep) -> None: ) fm = L2GFeatureMatrix.generate_features( - study_locus=study_locus, + study_locus=credible_set, study_index=studies, variant_gene=v2g, # colocalisation=coloc, @@ -185,9 +185,9 @@ def __post_init__(self: LocusToGeneStep) -> None: raise ValueError( "model_path and predictions_path must be set for predict mode." ) - predictions = L2GPrediction.from_study_locus( + predictions = L2GPrediction.from_credible_set( self.model_path, - study_locus, + credible_set, studies, v2g, # coloc From aa05aa51375fe1c22842e07da0a7565b2cc3a1c7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Nov 2023 11:54:46 +0000 Subject: [PATCH 22/25] build(deps-dev): bump ipython from 8.17.2 to 8.18.1 (#280) Bumps [ipython](https://github.com/ipython/ipython) from 8.17.2 to 8.18.1. - [Release notes](https://github.com/ipython/ipython/releases) - [Commits](https://github.com/ipython/ipython/compare/8.17.2...8.18.1) --- updated-dependencies: - dependency-name: ipython dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 17 ++++++++--------- pyproject.toml | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/poetry.lock b/poetry.lock index 32f676f53..465cbc5e4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3998,24 +3998,23 @@ test = ["flaky", "ipyparallel", "pre-commit", "pytest (>=7.0)", "pytest-asyncio" [[package]] name = "ipython" -version = "8.17.2" +version = "8.18.1" description = "IPython: Productive Interactive Computing" optional = false python-versions = ">=3.9" files = [ - {file = "ipython-8.17.2-py3-none-any.whl", hash = "sha256:1e4d1d666a023e3c93585ba0d8e962867f7a111af322efff6b9c58062b3e5444"}, - {file = "ipython-8.17.2.tar.gz", hash = "sha256:126bb57e1895594bb0d91ea3090bbd39384f6fe87c3d57fd558d0670f50339bb"}, + {file = "ipython-8.18.1-py3-none-any.whl", hash = "sha256:e8267419d72d81955ec1177f8a29aaa90ac80ad647499201119e2f05e99aa397"}, + {file = "ipython-8.18.1.tar.gz", hash = "sha256:ca6f079bb33457c66e233e4580ebfc4128855b4cf6370dddd73842a9563e8a27"}, ] [package.dependencies] -appnope = {version = "*", markers = "sys_platform == \"darwin\""} colorama = {version = "*", markers = "sys_platform == \"win32\""} decorator = "*" exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} jedi = ">=0.16" matplotlib-inline = "*" pexpect = {version = ">4.3", markers = "sys_platform != \"win32\""} -prompt-toolkit = ">=3.0.30,<3.0.37 || >3.0.37,<3.1.0" +prompt-toolkit = ">=3.0.41,<3.1.0" pygments = ">=2.4.0" stack-data = "*" traitlets = ">=5" @@ -5873,13 +5872,13 @@ dev = ["nose", "pipreqs", "twine"] [[package]] name = "prompt-toolkit" -version = "3.0.39" +version = "3.0.41" description = "Library for building powerful interactive command lines in Python" optional = false python-versions = ">=3.7.0" files = [ - {file = "prompt_toolkit-3.0.39-py3-none-any.whl", hash = "sha256:9dffbe1d8acf91e3de75f3b544e4842382fc06c6babe903ac9acb74dc6e08d88"}, - {file = "prompt_toolkit-3.0.39.tar.gz", hash = "sha256:04505ade687dc26dc4284b1ad19a83be2f2afe83e7a828ace0c72f3a1df72aac"}, + {file = "prompt_toolkit-3.0.41-py3-none-any.whl", hash = "sha256:f36fe301fafb7470e86aaf90f036eef600a3210be4decf461a5b1ca8403d3cb2"}, + {file = "prompt_toolkit-3.0.41.tar.gz", hash = "sha256:941367d97fc815548822aa26c2a269fdc4eb21e9ec05fc5d447cf09bad5d75f0"}, ] [package.dependencies] @@ -8402,4 +8401,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "3.10.8" -content-hash = "8197d06ec721972d642dd466b4de4fed93c9b14a279749ee7eb4f91857f5fce3" +content-hash = "fc7dce2fb06e39a21e470de2771bf5487bfd660ef3c4ec4e5dd7385ddeaa16d8" diff --git a/pyproject.toml b/pyproject.toml index 6ca883855..c5beaf455 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ pytest-xdist = "^3.4.0" [tool.poetry.group.dev.dependencies] -ipython = "^8.5.0" +ipython = "^8.18.1" ipykernel = "^6.19.0" google-cloud-dataproc = "^5.7.0" apache-airflow = "^2.7.3" From feaf1afd1604f6bbd7d76d9187fb3a36fc07c928 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Nov 2023 12:18:25 +0000 Subject: [PATCH 23/25] build(deps-dev): bump mkdocstrings-python from 1.7.4 to 1.7.5 (#279) Bumps [mkdocstrings-python](https://github.com/mkdocstrings/python) from 1.7.4 to 1.7.5. - [Release notes](https://github.com/mkdocstrings/python/releases) - [Changelog](https://github.com/mkdocstrings/python/blob/main/CHANGELOG.md) - [Commits](https://github.com/mkdocstrings/python/compare/1.7.4...1.7.5) --- updated-dependencies: - dependency-name: mkdocstrings-python dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 465cbc5e4..6eba09d58 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4959,13 +4959,13 @@ python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"] [[package]] name = "mkdocstrings-python" -version = "1.7.4" +version = "1.7.5" description = "A Python handler for mkdocstrings." optional = false python-versions = ">=3.8" files = [ - {file = "mkdocstrings_python-1.7.4-py3-none-any.whl", hash = "sha256:70eacbe5f2d5071f2e525ba0b35bc447d398437dfbcd90c63fe6e977551cfe26"}, - {file = "mkdocstrings_python-1.7.4.tar.gz", hash = "sha256:c2fc34efd70000ec31aee247910006e8dd9d1b9f3957bf46880c3f6e51a8f0d5"}, + {file = "mkdocstrings_python-1.7.5-py3-none-any.whl", hash = "sha256:5f6246026353f0c0785135db70c3fe9a5d9318990fc7ceb11d62097b8ffdd704"}, + {file = "mkdocstrings_python-1.7.5.tar.gz", hash = "sha256:c7d143728257dbf1aa550446555a554b760dcd40a763f077189d298502b800be"}, ] [package.dependencies] @@ -8401,4 +8401,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "3.10.8" -content-hash = "fc7dce2fb06e39a21e470de2771bf5487bfd660ef3c4ec4e5dd7385ddeaa16d8" +content-hash = "90889ae9da76eb541d14880319735717ebd5c30bd22af7c3af48d3f9206d3784" diff --git a/pyproject.toml b/pyproject.toml index c5beaf455..f3db534ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ ruff = "^0.1.3" [tool.poetry.group.docs.dependencies] mkdocs = "^1.5.3" -mkdocstrings-python = "^1.7.4" +mkdocstrings-python = "^1.7.5" mkdocs-material = "*" mkdocs-section-index = "^0.3.4" mkdocs-git-revision-date-localized-plugin = "^1.2.1" From 90c9ad3b6b100681a5840695655f351da1b5e5a2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Nov 2023 13:43:06 +0100 Subject: [PATCH 24/25] build(deps-dev): bump ruff from 0.1.3 to 0.1.6 (#276) Bumps [ruff](https://github.com/astral-sh/ruff) from 0.1.3 to 0.1.6. - [Release notes](https://github.com/astral-sh/ruff/releases) - [Changelog](https://github.com/astral-sh/ruff/blob/main/CHANGELOG.md) - [Commits](https://github.com/astral-sh/ruff/compare/v0.1.3...v0.1.6) --- updated-dependencies: - dependency-name: ruff dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 40 ++++++++++++++++++++-------------------- pyproject.toml | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/poetry.lock b/poetry.lock index 6eba09d58..d33f277de 100644 --- a/poetry.lock +++ b/poetry.lock @@ -7177,28 +7177,28 @@ pyasn1 = ">=0.1.3" [[package]] name = "ruff" -version = "0.1.3" -description = "An extremely fast Python linter, written in Rust." +version = "0.1.6" +description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.1.3-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:b46d43d51f7061652eeadb426a9e3caa1e0002470229ab2fc19de8a7b0766901"}, - {file = "ruff-0.1.3-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:b8afeb9abd26b4029c72adc9921b8363374f4e7edb78385ffaa80278313a15f9"}, - {file = "ruff-0.1.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca3cf365bf32e9ba7e6db3f48a4d3e2c446cd19ebee04f05338bc3910114528b"}, - {file = "ruff-0.1.3-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4874c165f96c14a00590dcc727a04dca0cfd110334c24b039458c06cf78a672e"}, - {file = "ruff-0.1.3-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eec2dd31eed114e48ea42dbffc443e9b7221976554a504767ceaee3dd38edeb8"}, - {file = "ruff-0.1.3-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:dc3ec4edb3b73f21b4aa51337e16674c752f1d76a4a543af56d7d04e97769613"}, - {file = "ruff-0.1.3-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e3de9ed2e39160800281848ff4670e1698037ca039bda7b9274f849258d26ce"}, - {file = "ruff-0.1.3-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1c595193881922cc0556a90f3af99b1c5681f0c552e7a2a189956141d8666fe8"}, - {file = "ruff-0.1.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f75e670d529aa2288cd00fc0e9b9287603d95e1536d7a7e0cafe00f75e0dd9d"}, - {file = "ruff-0.1.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:76dd49f6cd945d82d9d4a9a6622c54a994689d8d7b22fa1322983389b4892e20"}, - {file = "ruff-0.1.3-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:918b454bc4f8874a616f0d725590277c42949431ceb303950e87fef7a7d94cb3"}, - {file = "ruff-0.1.3-py3-none-musllinux_1_2_i686.whl", hash = "sha256:d8859605e729cd5e53aa38275568dbbdb4fe882d2ea2714c5453b678dca83784"}, - {file = "ruff-0.1.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:0b6c55f5ef8d9dd05b230bb6ab80bc4381ecb60ae56db0330f660ea240cb0d4a"}, - {file = "ruff-0.1.3-py3-none-win32.whl", hash = "sha256:3e7afcbdcfbe3399c34e0f6370c30f6e529193c731b885316c5a09c9e4317eef"}, - {file = "ruff-0.1.3-py3-none-win_amd64.whl", hash = "sha256:7a18df6638cec4a5bd75350639b2bb2a2366e01222825562c7346674bdceb7ea"}, - {file = "ruff-0.1.3-py3-none-win_arm64.whl", hash = "sha256:12fd53696c83a194a2db7f9a46337ce06445fb9aa7d25ea6f293cf75b21aca9f"}, - {file = "ruff-0.1.3.tar.gz", hash = "sha256:3ba6145369a151401d5db79f0a47d50e470384d0d89d0d6f7fab0b589ad07c34"}, + {file = "ruff-0.1.6-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:88b8cdf6abf98130991cbc9f6438f35f6e8d41a02622cc5ee130a02a0ed28703"}, + {file = "ruff-0.1.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:5c549ed437680b6105a1299d2cd30e4964211606eeb48a0ff7a93ef70b902248"}, + {file = "ruff-0.1.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cf5f701062e294f2167e66d11b092bba7af6a057668ed618a9253e1e90cfd76"}, + {file = "ruff-0.1.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:05991ee20d4ac4bb78385360c684e4b417edd971030ab12a4fbd075ff535050e"}, + {file = "ruff-0.1.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:87455a0c1f739b3c069e2f4c43b66479a54dea0276dd5d4d67b091265f6fd1dc"}, + {file = "ruff-0.1.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:683aa5bdda5a48cb8266fcde8eea2a6af4e5700a392c56ea5fb5f0d4bfdc0240"}, + {file = "ruff-0.1.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:137852105586dcbf80c1717facb6781555c4e99f520c9c827bd414fac67ddfb6"}, + {file = "ruff-0.1.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd98138a98d48a1c36c394fd6b84cd943ac92a08278aa8ac8c0fdefcf7138f35"}, + {file = "ruff-0.1.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a0cd909d25f227ac5c36d4e7e681577275fb74ba3b11d288aff7ec47e3ae745"}, + {file = "ruff-0.1.6-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:e8fd1c62a47aa88a02707b5dd20c5ff20d035d634aa74826b42a1da77861b5ff"}, + {file = "ruff-0.1.6-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:fd89b45d374935829134a082617954120d7a1470a9f0ec0e7f3ead983edc48cc"}, + {file = "ruff-0.1.6-py3-none-musllinux_1_2_i686.whl", hash = "sha256:491262006e92f825b145cd1e52948073c56560243b55fb3b4ecb142f6f0e9543"}, + {file = "ruff-0.1.6-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:ea284789861b8b5ca9d5443591a92a397ac183d4351882ab52f6296b4fdd5462"}, + {file = "ruff-0.1.6-py3-none-win32.whl", hash = "sha256:1610e14750826dfc207ccbcdd7331b6bd285607d4181df9c1c6ae26646d6848a"}, + {file = "ruff-0.1.6-py3-none-win_amd64.whl", hash = "sha256:4558b3e178145491e9bc3b2ee3c4b42f19d19384eaa5c59d10acf6e8f8b57e33"}, + {file = "ruff-0.1.6-py3-none-win_arm64.whl", hash = "sha256:03910e81df0d8db0e30050725a5802441c2022ea3ae4fe0609b76081731accbc"}, + {file = "ruff-0.1.6.tar.gz", hash = "sha256:1b09f29b16c6ead5ea6b097ef2764b42372aebe363722f1605ecbcd2b9207184"}, ] [[package]] @@ -8401,4 +8401,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "3.10.8" -content-hash = "90889ae9da76eb541d14880319735717ebd5c30bd22af7c3af48d3f9206d3784" +content-hash = "2d34459308397c1956a5ff15a84d61fc7654959638e63017acc2eeaa92044b32" diff --git a/pyproject.toml b/pyproject.toml index f3db534ea..15a2d609e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ pep8-naming = "^0.13.2" interrogate = "^1.5.0" isort = "^5.12.0" darglint = "^1.8.1" -ruff = "^0.1.3" +ruff = "^0.1.6" [tool.poetry.group.docs.dependencies] mkdocs = "^1.5.3" From a4a44da2be20e907e5f6f4bc4967afb01edc2e90 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Nov 2023 13:19:28 +0000 Subject: [PATCH 25/25] build(deps-dev): bump mypy from 1.7.0 to 1.7.1 (#278) Bumps [mypy](https://github.com/python/mypy) from 1.7.0 to 1.7.1. - [Changelog](https://github.com/python/mypy/blob/master/CHANGELOG.md) - [Commits](https://github.com/python/mypy/compare/v1.7.0...v1.7.1) --- updated-dependencies: - dependency-name: mypy dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 56 ++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/poetry.lock b/poetry.lock index d33f277de..2ba9c72e6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5115,38 +5115,38 @@ files = [ [[package]] name = "mypy" -version = "1.7.0" +version = "1.7.1" description = "Optional static typing for Python" optional = false python-versions = ">=3.8" files = [ - {file = "mypy-1.7.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5da84d7bf257fd8f66b4f759a904fd2c5a765f70d8b52dde62b521972a0a2357"}, - {file = "mypy-1.7.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a3637c03f4025f6405737570d6cbfa4f1400eb3c649317634d273687a09ffc2f"}, - {file = "mypy-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b633f188fc5ae1b6edca39dae566974d7ef4e9aaaae00bc36efe1f855e5173ac"}, - {file = "mypy-1.7.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d6ed9a3997b90c6f891138e3f83fb8f475c74db4ccaa942a1c7bf99e83a989a1"}, - {file = "mypy-1.7.0-cp310-cp310-win_amd64.whl", hash = "sha256:1fe46e96ae319df21359c8db77e1aecac8e5949da4773c0274c0ef3d8d1268a9"}, - {file = "mypy-1.7.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:df67fbeb666ee8828f675fee724cc2cbd2e4828cc3df56703e02fe6a421b7401"}, - {file = "mypy-1.7.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a79cdc12a02eb526d808a32a934c6fe6df07b05f3573d210e41808020aed8b5d"}, - {file = "mypy-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f65f385a6f43211effe8c682e8ec3f55d79391f70a201575def73d08db68ead1"}, - {file = "mypy-1.7.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0e81ffd120ee24959b449b647c4b2fbfcf8acf3465e082b8d58fd6c4c2b27e46"}, - {file = "mypy-1.7.0-cp311-cp311-win_amd64.whl", hash = "sha256:f29386804c3577c83d76520abf18cfcd7d68264c7e431c5907d250ab502658ee"}, - {file = "mypy-1.7.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:87c076c174e2c7ef8ab416c4e252d94c08cd4980a10967754f91571070bf5fbe"}, - {file = "mypy-1.7.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6cb8d5f6d0fcd9e708bb190b224089e45902cacef6f6915481806b0c77f7786d"}, - {file = "mypy-1.7.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d93e76c2256aa50d9c82a88e2f569232e9862c9982095f6d54e13509f01222fc"}, - {file = "mypy-1.7.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:cddee95dea7990e2215576fae95f6b78a8c12f4c089d7e4367564704e99118d3"}, - {file = "mypy-1.7.0-cp312-cp312-win_amd64.whl", hash = "sha256:d01921dbd691c4061a3e2ecdbfbfad029410c5c2b1ee88946bf45c62c6c91210"}, - {file = "mypy-1.7.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:185cff9b9a7fec1f9f7d8352dff8a4c713b2e3eea9c6c4b5ff7f0edf46b91e41"}, - {file = "mypy-1.7.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7a7b1e399c47b18feb6f8ad4a3eef3813e28c1e871ea7d4ea5d444b2ac03c418"}, - {file = "mypy-1.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc9fe455ad58a20ec68599139ed1113b21f977b536a91b42bef3ffed5cce7391"}, - {file = "mypy-1.7.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d0fa29919d2e720c8dbaf07d5578f93d7b313c3e9954c8ec05b6d83da592e5d9"}, - {file = "mypy-1.7.0-cp38-cp38-win_amd64.whl", hash = "sha256:2b53655a295c1ed1af9e96b462a736bf083adba7b314ae775563e3fb4e6795f5"}, - {file = "mypy-1.7.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c1b06b4b109e342f7dccc9efda965fc3970a604db70f8560ddfdee7ef19afb05"}, - {file = "mypy-1.7.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bf7a2f0a6907f231d5e41adba1a82d7d88cf1f61a70335889412dec99feeb0f8"}, - {file = "mypy-1.7.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:551d4a0cdcbd1d2cccdcc7cb516bb4ae888794929f5b040bb51aae1846062901"}, - {file = "mypy-1.7.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:55d28d7963bef00c330cb6461db80b0b72afe2f3c4e2963c99517cf06454e665"}, - {file = "mypy-1.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:870bd1ffc8a5862e593185a4c169804f2744112b4a7c55b93eb50f48e7a77010"}, - {file = "mypy-1.7.0-py3-none-any.whl", hash = "sha256:96650d9a4c651bc2a4991cf46f100973f656d69edc7faf91844e87fe627f7e96"}, - {file = "mypy-1.7.0.tar.gz", hash = "sha256:1e280b5697202efa698372d2f39e9a6713a0395a756b1c6bd48995f8d72690dc"}, + {file = "mypy-1.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:12cce78e329838d70a204293e7b29af9faa3ab14899aec397798a4b41be7f340"}, + {file = "mypy-1.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1484b8fa2c10adf4474f016e09d7a159602f3239075c7bf9f1627f5acf40ad49"}, + {file = "mypy-1.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31902408f4bf54108bbfb2e35369877c01c95adc6192958684473658c322c8a5"}, + {file = "mypy-1.7.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f2c2521a8e4d6d769e3234350ba7b65ff5d527137cdcde13ff4d99114b0c8e7d"}, + {file = "mypy-1.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:fcd2572dd4519e8a6642b733cd3a8cfc1ef94bafd0c1ceed9c94fe736cb65b6a"}, + {file = "mypy-1.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4b901927f16224d0d143b925ce9a4e6b3a758010673eeded9b748f250cf4e8f7"}, + {file = "mypy-1.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7f6985d05a4e3ce8255396df363046c28bea790e40617654e91ed580ca7c51"}, + {file = "mypy-1.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:944bdc21ebd620eafefc090cdf83158393ec2b1391578359776c00de00e8907a"}, + {file = "mypy-1.7.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9c7ac372232c928fff0645d85f273a726970c014749b924ce5710d7d89763a28"}, + {file = "mypy-1.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:f6efc9bd72258f89a3816e3a98c09d36f079c223aa345c659622f056b760ab42"}, + {file = "mypy-1.7.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6dbdec441c60699288adf051f51a5d512b0d818526d1dcfff5a41f8cd8b4aaf1"}, + {file = "mypy-1.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4fc3d14ee80cd22367caaaf6e014494415bf440980a3045bf5045b525680ac33"}, + {file = "mypy-1.7.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c6e4464ed5f01dc44dc9821caf67b60a4e5c3b04278286a85c067010653a0eb"}, + {file = "mypy-1.7.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:d9b338c19fa2412f76e17525c1b4f2c687a55b156320acb588df79f2e6fa9fea"}, + {file = "mypy-1.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:204e0d6de5fd2317394a4eff62065614c4892d5a4d1a7ee55b765d7a3d9e3f82"}, + {file = "mypy-1.7.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:84860e06ba363d9c0eeabd45ac0fde4b903ad7aa4f93cd8b648385a888e23200"}, + {file = "mypy-1.7.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8c5091ebd294f7628eb25ea554852a52058ac81472c921150e3a61cdd68f75a7"}, + {file = "mypy-1.7.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40716d1f821b89838589e5b3106ebbc23636ffdef5abc31f7cd0266db936067e"}, + {file = "mypy-1.7.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5cf3f0c5ac72139797953bd50bc6c95ac13075e62dbfcc923571180bebb662e9"}, + {file = "mypy-1.7.1-cp38-cp38-win_amd64.whl", hash = "sha256:78e25b2fd6cbb55ddfb8058417df193f0129cad5f4ee75d1502248e588d9e0d7"}, + {file = "mypy-1.7.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:75c4d2a6effd015786c87774e04331b6da863fc3fc4e8adfc3b40aa55ab516fe"}, + {file = "mypy-1.7.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2643d145af5292ee956aa0a83c2ce1038a3bdb26e033dadeb2f7066fb0c9abce"}, + {file = "mypy-1.7.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75aa828610b67462ffe3057d4d8a4112105ed211596b750b53cbfe182f44777a"}, + {file = "mypy-1.7.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ee5d62d28b854eb61889cde4e1dbc10fbaa5560cb39780c3995f6737f7e82120"}, + {file = "mypy-1.7.1-cp39-cp39-win_amd64.whl", hash = "sha256:72cf32ce7dd3562373f78bd751f73c96cfb441de147cc2448a92c1a308bd0ca6"}, + {file = "mypy-1.7.1-py3-none-any.whl", hash = "sha256:f7c5d642db47376a0cc130f0de6d055056e010debdaf0707cd2b0fc7e7ef30ea"}, + {file = "mypy-1.7.1.tar.gz", hash = "sha256:fcb6d9afb1b6208b4c712af0dafdc650f518836065df0d4fb1d800f5d6773db2"}, ] [package.dependencies]