-
Notifications
You must be signed in to change notification settings - Fork 9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add interval logic for l2g features #812
base: dev
Are you sure you want to change the base?
Changes from all commits
9c31f43
330b79e
183c827
500bae8
7cb4b5f
2035a52
985a901
b01b4e8
688c73a
6837df3
f194098
a9c0f6b
63d6db6
374a7c3
29ad08b
42e4ce9
55f947f
1de5fcf
c332d93
ee8c4f2
737a827
921c820
0e23427
6ac2d12
b1b2aa5
4f893fb
2bbf69c
aed12ec
37109e3
054eaa3
ad934c4
0eea3aa
8140d5a
24dc8c3
9aeb302
155fcdb
53a6ff3
b8914a7
78f661b
cf8b260
880cacf
c076e17
b074bc4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
--- | ||
title: From intervals | ||
--- | ||
|
||
## List of features | ||
|
||
::: gentropy.dataset.l2g_features.intervals.PchicMeanFeature | ||
::: gentropy.dataset.l2g_features.intervals.PchicMeanNeighbourhoodFeature | ||
::: gentropy.dataset.l2g_features.intervals.EnhTssCorrelationMeanFeature | ||
::: gentropy.dataset.l2g_features.intervals.EnhTssCorrelationMeanNeighbourhoodFeature | ||
::: gentropy.dataset.l2g_features.intervals.DhsPmtrCorrelationMeanFeature | ||
::: gentropy.dataset.l2g_features.intervals.DhsPmtrCorrelationMeanNeighbourhoodFeature | ||
|
||
## Common logic | ||
|
||
::: gentropy.dataset.l2g_features.intervals.common_interval_feature_logic | ||
::: gentropy.dataset.l2g_features.intervals.common_neighbourhood_interval_feature_logic |
xyg123 marked this conversation as resolved.
Show resolved
Hide resolved
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,8 +17,7 @@ class SessionConfig: | |
write_mode: str = "errorifexists" | ||
spark_uri: str = "local[*]" | ||
hail_home: str = os.path.dirname(hail_location) | ||
extended_spark_conf: dict[str, str] | None = field( | ||
default_factory=dict[str, str]) | ||
extended_spark_conf: dict[str, str] | None = field(default_factory=dict[str, str]) | ||
output_partitions: int = 200 | ||
_target_: str = "gentropy.common.session.Session" | ||
|
||
|
@@ -40,8 +39,7 @@ class ColocalisationConfig(StepConfig): | |
credible_set_path: str = MISSING | ||
coloc_path: str = MISSING | ||
colocalisation_method: str = MISSING | ||
colocalisation_method_params: dict[str, Any] = field( | ||
default_factory=dict[str, Any]) | ||
colocalisation_method_params: dict[str, Any] = field(default_factory=dict[str, Any]) | ||
_target_: str = "gentropy.colocalisation.ColocalisationStep" | ||
|
||
|
||
|
@@ -126,8 +124,7 @@ class EqtlCatalogueConfig(StepConfig): | |
eqtl_catalogue_paths_imported: str = MISSING | ||
eqtl_catalogue_study_index_out: str = MISSING | ||
eqtl_catalogue_credible_sets_out: str = MISSING | ||
mqtl_quantification_methods_blacklist: list[str] = field( | ||
default_factory=lambda: []) | ||
mqtl_quantification_methods_blacklist: list[str] = field(default_factory=lambda: []) | ||
eqtl_lead_pvalue_threshold: float = 1e-3 | ||
_target_: str = "gentropy.eqtl_catalogue.EqtlCatalogueStep" | ||
|
||
|
@@ -217,6 +214,18 @@ class LDBasedClumpingConfig(StepConfig): | |
_target_: str = "gentropy.ld_based_clumping.LDBasedClumpingStep" | ||
|
||
|
||
@dataclass | ||
class IntervalConfig(StepConfig): | ||
"""Interval step configuration.""" | ||
|
||
gene_index_path: str = MISSING | ||
liftover_chain_file_path: str = MISSING | ||
max_distance: int = 250_000 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why not 500kb? |
||
interval_sources: dict[str, str] = MISSING | ||
processed_interval_path: str = MISSING | ||
_target_: str = "gentropy.intervals.IntervalStep" | ||
|
||
|
||
@dataclass | ||
class LocusToGeneConfig(StepConfig): | ||
"""Locus to gene step configuration.""" | ||
|
@@ -263,6 +272,13 @@ class LocusToGeneConfig(StepConfig): | |
"vepMaximumNeighbourhood", | ||
"vepMean", | ||
"vepMeanNeighbourhood", | ||
# intervals | ||
"pchicMean", | ||
"pchicMeanNeighbourhood", | ||
"enhTssCorrelationMean", | ||
"enhTssCorrelationMeanNeighbourhood", | ||
"dhsPmtrCorrelationMean", | ||
"dhsPmtrCorrelationMeanNeighbourhood", | ||
# other | ||
"geneCount500kb", | ||
"proteinGeneCount500kb", | ||
|
@@ -306,6 +322,7 @@ class LocusToGeneFeatureMatrixConfig(StepConfig): | |
colocalisation_path: str | None = None | ||
study_index_path: str | None = None | ||
gene_index_path: str | None = None | ||
interval_path: str | None = None | ||
feature_matrix_path: str = MISSING | ||
features_list: list[str] = field( | ||
default_factory=lambda: [ | ||
|
@@ -340,6 +357,13 @@ class LocusToGeneFeatureMatrixConfig(StepConfig): | |
"vepMaximumNeighbourhood", | ||
"vepMean", | ||
"vepMeanNeighbourhood", | ||
# intervals | ||
"pchicMean", | ||
"pchicMeanNeighbourhood", | ||
"enhTssCorrelationMean", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd like more readable feature names, so that it is easy to tell what they represent. |
||
"enhTssCorrelationMeanNeighbourhood", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd like more readable feature names, so that it is easy to tell what they represent. |
||
"dhsPmtrCorrelationMean", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd like more readable feature names, so that it is easy to tell what they represent. |
||
"dhsPmtrCorrelationMeanNeighbourhood", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd like more readable feature names, so that it is easy to tell what they represent. |
||
# other | ||
"geneCount500kb", | ||
"proteinGeneCount500kb", | ||
|
@@ -681,8 +705,7 @@ class Config: | |
"""Application configuration.""" | ||
|
||
# this is unfortunately verbose due to @dataclass limitations | ||
defaults: List[Any] = field(default_factory=lambda: [ | ||
"_self_", {"step": MISSING}]) | ||
defaults: List[Any] = field(default_factory=lambda: ["_self_", {"step": MISSING}]) | ||
step: StepConfig = MISSING | ||
datasets: dict[str, str] = field(default_factory=dict) | ||
|
||
|
@@ -716,8 +739,7 @@ def register_config() -> None: | |
name="gwas_catalog_top_hit_ingestion", | ||
node=GWASCatalogTopHitIngestionConfig, | ||
) | ||
cs.store(group="step", name="ld_based_clumping", | ||
node=LDBasedClumpingConfig) | ||
cs.store(group="step", name="ld_based_clumping", node=LDBasedClumpingConfig) | ||
cs.store(group="step", name="ld_index", node=LDIndexConfig) | ||
cs.store(group="step", name="locus_to_gene", node=LocusToGeneConfig) | ||
cs.store( | ||
|
@@ -735,8 +757,7 @@ def register_config() -> None: | |
|
||
cs.store(group="step", name="pics", node=PICSConfig) | ||
cs.store(group="step", name="gnomad_variants", node=GnomadVariantConfig) | ||
cs.store(group="step", name="ukb_ppp_eur_sumstat_preprocess", | ||
node=UkbPppEurConfig) | ||
cs.store(group="step", name="ukb_ppp_eur_sumstat_preprocess", node=UkbPppEurConfig) | ||
cs.store(group="step", name="variant_index", node=VariantIndexConfig) | ||
cs.store(group="step", name="variant_to_vcf", node=ConvertToVcfStepConfig) | ||
cs.store( | ||
|
@@ -769,7 +790,5 @@ def register_config() -> None: | |
name="locus_to_gene_associations", | ||
node=LocusToGeneAssociationsStepConfig, | ||
) | ||
cs.store(group="step", name="finngen_ukb_meta_ingestion", | ||
node=FinngenUkbMetaConfig) | ||
cs.store(group="step", name="credible_set_qc", | ||
node=CredibleSetQCStepConfig) | ||
cs.store(group="step", name="finngen_ukb_meta_ingestion", node=FinngenUkbMetaConfig) | ||
cs.store(group="step", name="credible_set_qc", node=CredibleSetQCStepConfig) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,17 +5,19 @@ | |
from dataclasses import dataclass | ||
from typing import TYPE_CHECKING | ||
|
||
import pyspark.sql.functions as f | ||
|
||
from gentropy.common.Liftover import LiftOverSpark | ||
from gentropy.common.schemas import parse_spark_schema | ||
from gentropy.dataset.dataset import Dataset | ||
from gentropy.dataset.gene_index import GeneIndex | ||
from gentropy.dataset.variant_index import VariantIndex | ||
|
||
if TYPE_CHECKING: | ||
from pyspark.sql import SparkSession | ||
from pyspark.sql.types import StructType | ||
|
||
|
||
|
||
@dataclass | ||
class Intervals(Dataset): | ||
"""Intervals dataset links genes to genomic regions based on genome interaction studies.""" | ||
|
@@ -71,3 +73,35 @@ def from_source( | |
source_class = source_to_class[source_name] | ||
data = source_class.read(spark, source_path) # type: ignore | ||
return source_class.parse(data, gene_index, lift) # type: ignore | ||
|
||
def overlap_variant_index( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am not sure it is the best idea to introduce the variant field in the intervals data model. It sounds counterintuitive to derail the definition of this dataset from something that describes genomic ranges. However, if you decide to keep it, I suggest renaming to something like |
||
self: Intervals, variant_index: VariantIndex | ||
) -> Intervals: | ||
"""Overlaps intervals with a variant index. | ||
|
||
Args: | ||
variant_index (VariantIndex): Variant index dataset | ||
|
||
Returns: | ||
Intervals: Variant-to-gene intervals dataset | ||
""" | ||
return Intervals( | ||
_df=( | ||
self.df.alias("interval") | ||
.join( | ||
variant_index.df.selectExpr( | ||
"chromosome as vi_chromosome", "variantId", "position" | ||
).alias("vi"), | ||
on=[ | ||
f.col("vi.vi_chromosome") == f.col("interval.chromosome"), | ||
f.col("vi.position").between( | ||
f.col("interval.start"), f.col("interval.end") | ||
), | ||
], | ||
how="inner", | ||
) | ||
.drop("vi_chromosome", "position") | ||
# .drop("start", "end", "vi_chromosome", "position") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think you have to uncomment this? |
||
), | ||
_schema=Intervals.get_schema(), | ||
) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why nullable?