Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: multiple python version support with latest pyspark and hail #974

Merged
merged 47 commits into dev from pyspark-bump
Jan 28, 2025
Merged
Changes from 1 commit
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
90e6028
chore(pyspark): update to 3.5.X
SzymonSzyszkowski Jan 17, 2025
630c0c9
chore: fix doctest syntax
SzymonSzyszkowski Jan 17, 2025
1dbe1b0
chore: bump temurin version to 11
SzymonSzyszkowski Jan 17, 2025
bcf0b9a
feat: allow multiple python versions
Jan 21, 2025
4d3380a
feat: python matrix for gha
Jan 21, 2025
28b3e2c
chore: pre-commit auto fixes [...]
pre-commit-ci[bot] Jan 21, 2025
9cc2c78
Merge branch 'dev' into pyspark-bump
project-defiant Jan 21, 2025
fbaa8d9
chore: typos
Jan 21, 2025
7f416ed
chore: fix python version in setup dev script
Jan 21, 2025
5a9cd8f
fix: attempt to fix the 3.11 tests
Jan 21, 2025
c46cdab
fix: set the session correctly in variant_index_config
Jan 21, 2025
18c66b1
Revert "chore: fix doctest syntax"
Jan 21, 2025
def0fbb
chore: update dependencies
Jan 21, 2025
4eabc51
Revert "Revert "chore: fix doctest syntax""
Jan 21, 2025
c350211
chore: bump image to 2.2
Jan 21, 2025
8c2fa2b
chore: update lock files
Jan 21, 2025
1719b5c
build: poetry cleanup
Jan 22, 2025
c45ac1c
build: uv checks dropped
Jan 22, 2025
1fccce6
chore: fix dockerfile and install test deps
Jan 22, 2025
4c1efbd
build(uv): add all dependencies to run tests
Jan 22, 2025
1e913a8
Merge branch 'dev' into pyspark-bump
project-defiant Jan 22, 2025
08e03e3
chore: fix test issue with rounding error
Jan 22, 2025
f9fc356
chore: fix dependency version lower bounds
Jan 22, 2025
f1ff1f9
chore: add .python-version file to ignored
Jan 22, 2025
98d464d
build: new setup
Jan 23, 2025
aa64db9
build: new setup
Jan 23, 2025
570e33e
build: new setup
Jan 23, 2025
89a9c34
build: new setup
Jan 23, 2025
2db0610
build: new setup
Jan 23, 2025
b392368
revert: bring back initialization actions
Jan 23, 2025
2329a8a
chore: align variable name
Jan 23, 2025
04c2ed2
chore: update pre-commit python version
Jan 23, 2025
979325d
chore: docs update
Jan 23, 2025
3eb7d55
Merge branch 'dev' into pyspark-bump
project-defiant Jan 24, 2025
79022b9
feat: more complex uv installation
SzymonSzyszkowski Jan 27, 2025
9adc76a
feat: notify to source shellrc file when installing uv
SzymonSzyszkowski Jan 27, 2025
7d10b63
fix: checks
SzymonSzyszkowski Jan 27, 2025
a17290d
chore: debug gha
SzymonSzyszkowski Jan 27, 2025
79de16b
chore: debug gha
SzymonSzyszkowski Jan 27, 2025
e7d5cd8
feat: debug gha
SzymonSzyszkowski Jan 27, 2025
f4ab0d0
feat: debug gha
SzymonSzyszkowski Jan 27, 2025
3bc1318
feat: debug gha
SzymonSzyszkowski Jan 27, 2025
a82b86c
feat: force user shell
SzymonSzyszkowski Jan 27, 2025
09b83a8
feat: gha debug
SzymonSzyszkowski Jan 27, 2025
640f493
feat: gha debug
SzymonSzyszkowski Jan 27, 2025
4a2018a
feat: gha debug
SzymonSzyszkowski Jan 27, 2025
e3ea829
feat: gha debug
SzymonSzyszkowski Jan 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Revert "chore: fix doctest syntax"
This reverts commit 630c0c9.
  • Loading branch information
Szymon Szyszkowski committed Jan 21, 2025

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
commit 18c66b1d1d9c4833e3dbfc406ba3440577a119a5
18 changes: 9 additions & 9 deletions src/gentropy/common/spark_helpers.py
Original file line number Diff line number Diff line change
@@ -119,7 +119,7 @@ def pvalue_to_zscore(pval_col: Column) -> Column:
| t3| 0.05| 1.959964|
| t4| 1e-300| 37.537838|
| t5|1e-1000| 37.537838|
| t6| NA| NULL|
| t6| NA| null|
+---+-------+----------+
<BLANKLINE>
@@ -149,7 +149,7 @@ def nullify_empty_array(column: Column) -> Column:
+---------+---------+
| value| new|
+---------+---------+
| []| NULL|
| []| null|
|[1, 2, 3]|[1, 2, 3]|
+---------+---------+
<BLANKLINE>
@@ -472,8 +472,8 @@ def map_column_by_dictionary(col: Column, mapping_dict: dict[str, Any]) -> Colum
| label| id|
+--------------------+---------+
| consequence_1|SO:000000|
|unmapped_consequence| NULL|
| NULL| NULL|
|unmapped_consequence| null|
| null| null|
+--------------------+---------+
<BLANKLINE>
"""
@@ -604,7 +604,7 @@ def rename_all_columns(df: DataFrame, prefix: str) -> DataFrame:
+-----------+-----------+-----------+
| a| 1.2| true|
| b| 0.0| false|
| c| NULL| NULL|
| c| null| null|
+-----------+-----------+-----------+
<BLANKLINE>
"""
@@ -649,7 +649,7 @@ def safe_array_union(
|[a, b]|
| [c]|
| [d]|
| NULL|
| null|
+------+
<BLANKLINE>
>>> schema="arr2: array<struct<b:int,a:string>>, arr: array<struct<a:string,b:int>>"
@@ -752,7 +752,7 @@ def create_empty_column_if_not_exists(
+----+----+----+
|col1|col2|col3|
+----+----+----+
| 1| 2|NULL|
| 1| 2|null|
+----+----+----+
<BLANKLINE>
"""
@@ -782,8 +782,8 @@ def get_standard_error_from_confidence_interval(lower: Column, upper: Column) ->
| standard_error|
+-------------------+
|0.25510204081632654|
| NULL|
| NULL|
| null|
| null|
+-------------------+
<BLANKLINE>
"""
8 changes: 4 additions & 4 deletions src/gentropy/common/utils.py
Original file line number Diff line number Diff line change
@@ -50,7 +50,7 @@ def calculate_confidence_interval(
|pvalue_mantissa|pvalue_exponent|beta|standard_error|betaConfidenceIntervalLower|betaConfidenceIntervalUpper|
+---------------+---------------+----+--------------+---------------------------+---------------------------+
| 2.5| -10| 0.5| 0.2| 0.10799999999999998| 0.892|
| 3.0| -5| 1.0| NULL| 0.5303663900832607| 1.4696336099167393|
| 3.0| -5| 1.0| null| 0.5303663900832607| 1.4696336099167393|
| 1.5| -8|-0.2| 0.1| -0.396| -0.00400000000000...|
+---------------+---------------+----+--------------+---------------------------+---------------------------+
<BLANKLINE>
@@ -98,9 +98,9 @@ def convert_odds_ratio_to_beta(
|beta|oddsRatio|standardError| beta|standardError|
+----+---------+-------------+-------------------+-------------+
| 0.1| 1.1| 0.1| 0.1| 0.1|
|NULL| 1.1| 0.1|0.09531017980432493| NULL|
| 0.1| NULL| 0.1| 0.1| 0.1|
| 0.1| 1.1| NULL| 0.1| NULL|
|null| 1.1| 0.1|0.09531017980432493| null|
| 0.1| null| 0.1| 0.1| 0.1|
| 0.1| 1.1| null| 0.1| null|
+----+---------+-------------+-------------------+-------------+
<BLANKLINE>
2 changes: 1 addition & 1 deletion src/gentropy/dataset/variant_index.py
Original file line number Diff line number Diff line change
@@ -94,7 +94,7 @@ def hash_long_variant_ids(
|v_short |v_short |
|v_looooooong|OTVAR_23_23_3749d019d645894770c364992ae70a05|
|no_chrom |OTVAR_41acfcd7d4fd523b33600b504914ef25 |
|NULL |NULL |
|null |null |
+------------+--------------------------------------------+
<BLANKLINE>
"""
10 changes: 5 additions & 5 deletions src/gentropy/datasource/ensembl/vep_parser.py
Original file line number Diff line number Diff line change
@@ -192,7 +192,7 @@ def _colocated_variants_to_rsids(colocated_variants: Column) -> Column:
|rsIds |
+---------------+
|[rs1, rs2, rs3]|
|[NULL] |
|[null] |
+---------------+
<BLANKLINE>
"""
@@ -380,8 +380,8 @@ def _get_max_alpha_missense(transcripts: Column) -> Column:
+-----------------------------------------------------+
|am |
+-----------------------------------------------------+
|{AlphaMissense, assessment 1, 0.4, NULL, gene1, NULL}|
|{AlphaMissense, NULL, NULL, NULL, gene1, NULL} |
|{AlphaMissense, assessment 1, 0.4, null, gene1, null}|
|{AlphaMissense, null, null, null, gene1, null} |
+-----------------------------------------------------+
<BLANKLINE>
"""
@@ -513,8 +513,8 @@ def _parser_amino_acid_change(amino_acids: Column, protein_end: Column) -> Colum
|amino_acid_change|
+-----------------+
| A1B|
| NULL|
| NULL|
| null|
| null|
+-----------------+
<BLANKLINE>
"""
45 changes: 24 additions & 21 deletions src/gentropy/datasource/gwas_catalog/associations.py
Original file line number Diff line number Diff line change
@@ -120,9 +120,9 @@ def _normalise_pvaluetext(p_value_text: Column) -> Column:
| European Ancestry| [EA]|
| African ancestry| [AA]|
|Alzheimer’s Disease| [AD]|
| (progression)| NULL|
| | NULL|
| NULL| NULL|
| (progression)| null|
| | null|
| null| null|
+-------------------+----------+
<BLANKLINE>
@@ -297,7 +297,9 @@ def _map_variants_to_gnomad_variants(
"position",
)

return gwas_associations.join(fully_mapped_associations, on="rowId", how="left")
return gwas_associations.join(
fully_mapped_associations, on="rowId", how="left"
)

@staticmethod
def _compare_rsids(gnomad: Column, gwas: Column) -> Column:
@@ -421,7 +423,7 @@ def _check_concordance(
| A| T| G| true|
| A| C| G| false|
| A| A| ?| true|
| NULL| NULL| A| true|
| null| null| A| true|
+----------+---------------+---------------+------------+
<BLANKLINE>
@@ -479,7 +481,7 @@ def _get_reverse_complement(allele_col: Column) -> Column:
| AC| GT|
|GTaatc| GATTAC|
| ?| ?|
| NULL| NULL|
| null| null|
+------+-------------+
<BLANKLINE>
@@ -549,7 +551,7 @@ def _are_alleles_palindromic(
| AG| AT| false|
| AT| AT| true|
| CATATG| CATATG| true|
| NULL| -| false|
| null| -| false|
+---------+---------+--------------+
<BLANKLINE>
@@ -651,7 +653,7 @@ def _harmonise_odds_ratio(
| 0.5|false| 0.5|
| 0.5| true| 2.0|
| 0.0|false| 0.0|
| 0.0| true| NULL|
| 0.0| true| null|
+------+-----+----------+
<BLANKLINE>
"""
@@ -691,7 +693,7 @@ def _concatenate_substudy_description(
|association_trait|mapped_trait_uri |pvalue_text |substudy_description |
+-----------------+-------------------------------------------------------------------------+-----------------+------------------------------------------+
|Height |http://www.ebi.ac.uk/efo/EFO_0000001,http://www.ebi.ac.uk/efo/EFO_0000002|European Ancestry|Height|EA|EFO_0000001/EFO_0000002 |
|Schizophrenia |http://www.ebi.ac.uk/efo/MONDO_0005090 |NULL |Schizophrenia|no_pvalue_text|MONDO_0005090|
|Schizophrenia |http://www.ebi.ac.uk/efo/MONDO_0005090 |null |Schizophrenia|no_pvalue_text|MONDO_0005090|
+-----------------+-------------------------------------------------------------------------+-----------------+------------------------------------------+
<BLANKLINE>
"""
@@ -801,9 +803,9 @@ def _qc_genomic_location(
+----------------------------+----------+--------+
|qc |chromosome|position|
+----------------------------+----------+--------+
|[Incomplete genomic mapping]|NULL |NULL |
|[Incomplete genomic mapping]|1 |NULL |
|[Incomplete genomic mapping]|NULL |1 |
|[Incomplete genomic mapping]|null |null |
|[Incomplete genomic mapping]|1 |null |
|[Incomplete genomic mapping]|null |1 |
|[] |1 |1 |
+----------------------------+----------+--------+
<BLANKLINE>
@@ -865,8 +867,8 @@ def _qc_unmapped_variants(qc: Column, alternate_allele: Column) -> Column:
+----------------+----+--------------------+
|alternate_allele| qc| new_qc|
+----------------+----+--------------------+
| A|NULL| []|
| NULL|NULL|[No mapping in Gn...|
| A|null| []|
| null|null|[No mapping in Gn...|
+----------------+----+--------------------+
<BLANKLINE>
@@ -938,7 +940,7 @@ def _get_effect_type(ci_text: Column) -> Column:
|95% CI: [0.1-0.2] |odds_ratio |
|95% CI: [0.1-0.2] increase|beta |
|95% CI: [0.1-0.2] decrease|beta |
|NULL |NULL |
|null |null |
+--------------------------+-----------+
<BLANKLINE>
@@ -992,11 +994,11 @@ def harmonise_association_effect_to_beta(
+-------------------------+---------------+---------------+----------+--------------------+-------------------+--------------------+
|STRONGEST SNP-RISK ALLELE|referenceAllele|alternateAllele|OR or BETA| 95% CI (TEXT)| beta| standardError|
+-------------------------+---------------+---------------+----------+--------------------+-------------------+--------------------+
| rs123-T| A| T| 0.1|[0.08-0.12] unit ...| NULL| NULL|
| rs123-T| A| T| 0.1|[0.08-0.12] unit ...| null| null|
| rs123-C| G| T| 0.1|[0.08-0.12] unit ...| -0.1|0.010204081404574064|
| rs123-T| C| T| 0.1|[0.08-0.12] unit ...| 0.1|0.010204081404574064|
| rs123-T| C| T| 0.1| [0.08-0.12]|-2.3025850929940455| NULL|
| rs123-C| G| T| 0.1| [0.08-0.12]| 2.302585092994046| NULL|
| rs123-T| C| T| 0.1| [0.08-0.12]|-2.3025850929940455| null|
| rs123-C| G| T| 0.1| [0.08-0.12]| 2.302585092994046| null|
+-------------------------+---------------+---------------+----------+--------------------+-------------------+--------------------+
<BLANKLINE>
"""
@@ -1106,8 +1108,7 @@ def from_source(
return StudyLocusGWASCatalog(
_df=gwas_associations.withColumn(
# temporary column
"rowId",
f.monotonically_increasing_id().cast(StringType()),
"rowId", f.monotonically_increasing_id().cast(StringType())
)
.transform(
# Map/harmonise variants to variant annotation dataset:
@@ -1139,7 +1140,9 @@ def from_source(
# Adding study-locus id:
.withColumn(
"studyLocusId",
StudyLocus.assign_study_locus_id(["studyId", "variantId"]),
StudyLocus.assign_study_locus_id(
["studyId", "variantId"]
),
)
.select(
# INSIDE STUDY-LOCUS SCHEMA:
4 changes: 2 additions & 2 deletions src/gentropy/datasource/gwas_catalog/study_index.py
Original file line number Diff line number Diff line change
@@ -219,7 +219,7 @@ def parse_cohorts(raw_cohort: Column) -> Column:
|parsedCohorts |
+--------------------------------------+
|[BioME, CaPS, Estonia, FHS, UKB, GERA]|
|NULL |
|null |
+--------------------------------------+
<BLANKLINE>
"""
@@ -655,7 +655,7 @@ def add_no_sumstats_flag(self: StudyIndexGWASCatalog) -> StudyIndexGWASCatalog:
"""
self.df = self.df.withColumn(
"qualityControls",
f.array(f.lit(StudyQualityCheck.SUMSTATS_NOT_AVAILABLE.value)),
f.array(f.lit(StudyQualityCheck.SUMSTATS_NOT_AVAILABLE.value))
)
return self

12 changes: 6 additions & 6 deletions src/gentropy/method/colocalisation.py
Original file line number Diff line number Diff line change
@@ -51,8 +51,8 @@ def get_tag_variant_source(statistics: Column) -> Column:
| a| b|source|
+----+----+------+
| a| b| both|
|NULL| b| right|
| a|NULL| left|
|null| b| right|
| a|null| left|
+----+----+------+
<BLANKLINE>
"""
@@ -181,8 +181,8 @@ def colocalise(
.withColumn("colocalisationMethod", f.lit(cls.METHOD_NAME))
.join(
overlapping_signals.calculate_beta_ratio(),
on=["leftStudyLocusId", "rightStudyLocusId", "chromosome"],
how="left",
on=["leftStudyLocusId", "rightStudyLocusId","chromosome"],
how="left"
)
),
_schema=Colocalisation.get_schema(),
@@ -386,8 +386,8 @@ def colocalise(
.withColumn("colocalisationMethod", f.lit(cls.METHOD_NAME))
.join(
overlapping_signals.calculate_beta_ratio(),
on=["leftStudyLocusId", "rightStudyLocusId", "chromosome"],
how="left",
on=["leftStudyLocusId", "rightStudyLocusId","chromosome"],
how="left"
)
),
_schema=Colocalisation.get_schema(),