Skip to content

Commit

Permalink
improving pylint score
Browse files Browse the repository at this point in the history
  • Loading branch information
Dainius Kirsnauskas committed Sep 15, 2024
1 parent 31f85fc commit 3b909a9
Show file tree
Hide file tree
Showing 13 changed files with 107 additions and 107 deletions.
2 changes: 1 addition & 1 deletion api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
)

# DATA COLLECTION IMPORT
from .data.collection import (
from .data.downloading import (
# Custom exceptions
BadResponseException,
DownloadError,
Expand Down
65 changes: 0 additions & 65 deletions api/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,65 +0,0 @@
"""
Package for data collection purposes; provides both collection and refactoring functionality.
Data from the LOVD, ClinVar and gnomAD databases can be downloaded using this package. gnomAD and
ClinVar downloads are limited to the EYS gene, but it is possible to download data for any gene
from LOVD.
All necessary functionality can be imported directly from data without
specifying the module.
A data collection pipeline example is established for the project's specific usage.
"""

# CONSTANTS IMPORT
from .constants import (
# URLs for LOVD database
LOVD_URL, LOVD_URL_EYS, LOVD_FILE_URL, LOVD_FILE_URL_EYS,

# URLs for gnomAD database
GNOMAD_URL, GNOMAD_URL_EYS, GNOMAD_FILE_URL_EYS,

# URLs for ClinVar database
CLINVAR_URL, CLINVAR_URL_EYS, CLINVAR_FILE_URL_EYS,

# Paths for data storage
DATA_PATH, LOVD_PATH, GNOMAD_PATH, CLINVAR_PATH,

# Data types for tables
LOVD_TABLES_DATA_TYPES,

# Paths for database downloads
DATABASES_DOWNLOAD_PATHS,

GNOMAD_PATH,
)

# DATA COLLECTION IMPORT
from .collection import (
# Custom exceptions
BadResponseException,
DownloadError,

# Custom utility functions
get_file_from_url,

# Functions for downloading databases
download_lovd_database_for_eys_gene,
download_genes_lovd,
download_database_for_eys_gene,

# Functions for storing databases
store_database_for_eys_gene
)

# DATA REFACTORING IMPORT
from .refactoring import (
# Functions for refactoring data
set_lovd_dtypes,
parse_lovd,
from_clinvar_name_to_cdna_position,
save_lovd_as_vcf,
request_gnomad_api_data,
merge_gnomad_lovd,
parse_gnomad,
set_gnomad_dtypes,
)
1 change: 0 additions & 1 deletion api/data/downloading.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,4 +188,3 @@ def store_database_for_eys_gene(database_name, override=False):
download_lovd_database_for_eys_gene(override)
else:
download_database_for_eys_gene(database_name, override)

71 changes: 43 additions & 28 deletions api/data/refactoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,9 @@ def lovd_fill_hg38(lovd: pd.DataFrame):
return
lovd.loc[:,'hg38_gnomad_format'] = lovd.loc[:,'VariantOnGenome/DNA/hg38'].replace('', pd.NA)
missing_hg38_mask = lovd.loc[:,'hg38_gnomad_format'].isna()
lovd.loc[missing_hg38_mask, 'hg38_gnomad_format'] = lovd.loc[missing_hg38_mask, 'VariantOnGenome/DNA'].apply(
convert_hg19_if_missing)
lovd.loc[missing_hg38_mask, 'hg38_gnomad_format'] = (lovd.loc[missing_hg38_mask,
'VariantOnGenome/DNA'].
apply(convert_hg19_if_missing))
lovd.loc[:,'hg38_gnomad_format'] = lovd.loc[:,'hg38_gnomad_format'].apply(convert_to_gnomad_gen)


Expand Down Expand Up @@ -316,6 +317,38 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"):
f.write("\n")


def find_popmax_in_gnomad(data):
    """
    Finds popmax in gnomad data.

    For every row, the largest value among the ``Allele_Frequency_<population id>``
    columns is written to the ``Popmax`` column and the readable name of the
    matching population is written to ``Popmax population``. Rows in which no
    frequency is greater than zero (all zero or NaN) get ``0`` and ``''``.
    When several populations share the largest frequency, the one listed first wins.
    The dataframe is modified in place; nothing is returned.

    :param DataFrame data: Gnomad data holding one ``Allele_Frequency_<id>`` column
        for every population id listed in ``population_ids`` below.
    """

    population_mapping = {
        'afr': 'African/African American',
        'eas': 'East Asian',
        'asj': 'Ashkenazi Jew',
        'sas': 'South Asian',
        'nfe': 'European (non-Finnish)',
        'fin': 'European (Finnish)',
        'mid': 'Middle Eastern',
        'amr': 'Admixed American',
        'ami': "Amish",
        'remaining': 'Remaining',
        '': ''
    }
    population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining']
    frequency_columns = [f'Allele_Frequency_{population_id}'
                         for population_id in population_ids]

    popmax_values = []
    popmax_populations = []
    # Rows are walked by position, so the result does not depend on the dataframe
    # having a default 0..n-1 index (label lookups with a counter break otherwise).
    for frequencies in data.loc[:, frequency_columns].itertuples(index=False, name=None):
        max_pop = 0.0
        max_id = ''
        for population_id, frequency in zip(population_ids, frequencies):
            # NaN never compares greater, so missing frequencies are skipped.
            if frequency > max_pop:
                max_pop = frequency
                max_id = population_id
        popmax_values.append(max_pop)
        popmax_populations.append(population_mapping[max_id])

    # Reusing data.index keeps the new columns aligned row-for-row with the input.
    data['Popmax'] = pd.Series(popmax_values, index=data.index, dtype=float)
    data['Popmax population'] = pd.Series(popmax_populations, index=data.index, dtype=object)


def prepare_popmax_calculation(df, pop_data, name, pop_ids, index):
"""
prepares the calculation of popmax and popmax population for a variant.
Expand Down Expand Up @@ -412,44 +445,26 @@ def request_gnomad_api_data(gene_name):
df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0) # Protein change

df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an']
df.loc[:, 'Homozygote Count'] = df.loc[:, 'exome.ac_hom'].fillna(0) + df.loc[:, 'genome.ac_hom'].fillna(0)
df.loc[:, 'Homozygote Count'] = (df.loc[:, 'exome.ac_hom'].fillna(0)
+ df.loc[:, 'genome.ac_hom'].fillna(0))
exome_populations = df.loc[:, 'exome.populations']
genome_populations = df.loc[:, 'genome.populations']
population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining']

for i in range(len(exome_populations)):
for i in range(exome_populations.shape[0]):
exome_pop = exome_populations[i]
prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i)
genome_pop = genome_populations[i]
prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i)

for population_id in population_ids:
df.loc[:, f'Allele_Frequency_{population_id}'] = (
(df.loc[:, f'exome_ac_{population_id}'].fillna(0) + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) / (
df.loc[:, f'exome_an_{population_id}'].fillna(0) + df.loc[:, f'genome_an_{population_id}'].fillna(0)))
population_mapping = {
'afr': 'African/African American',
'eas': 'East Asian',
'asj': 'Ashkenazi Jew',
'sas': 'South Asian',
'nfe': 'European (non-Finnish)',
'fin': 'European (Finnish)',
'mid': 'Middle Eastern',
'amr': 'Admixed American',
'ami': "Amish",
'remaining': 'Remaining',
'': ''
}
(df.loc[:, f'exome_ac_{population_id}'].fillna(0)
+ df.loc[:, f'genome_ac_{population_id}'].fillna(0))
/ (df.loc[:, f'exome_an_{population_id}'].fillna(0)
+ df.loc[:, f'genome_an_{population_id}'].fillna(0)))

for i in range(df.shape[0]):
max_pop = 0
max_id = ''
for population_id in population_ids:
if df.loc[i, f'Allele_Frequency_{population_id}'] > max_pop:
max_pop = df.loc[i, f'Allele_Frequency_{population_id}']
max_id = population_id
df.loc[i, 'Popmax'] = max_pop
df.loc[i, 'Popmax population'] = population_mapping[max_id]
find_popmax_in_gnomad(df)
not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency',
'variant_id', 'cDNA change', 'Protein change']

Expand Down
7 changes: 0 additions & 7 deletions api/tools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +0,0 @@
"""
This module provides access to the tools for fetching scores for genetic variants.
"""

from .revel.revel import (
get_revel_scores
)
9 changes: 6 additions & 3 deletions api/tools/revel/revel.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
""" Retrieves REVEL scores for specific variants based on chromosome and position from a CSV file. """
"""
Retrieves REVEL scores for specific variants based on chromosome and position from a CSV file.
"""


import pandas as pd
import os
import pandas as pd

current_script_dir = os.path.dirname(os.path.abspath(__file__))
revel_file = os.path.join(current_script_dir, 'revel_with_transcript_ids')
Expand All @@ -16,7 +19,7 @@ def get_revel_scores(chromosome, position):
"""
variants = []
revel_data = pd.read_csv(revel_file)

variants = revel_data[
(revel_data['chr'] == chromosome) &
(revel_data['hg19_pos'] == position)
Expand Down
Empty file added app/back-end/__init__.py
Empty file.
3 changes: 2 additions & 1 deletion app/back-end/src/events/workspace_export_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ def handle_workspace_export_feedback(data):
if data["status"] == "success":
socketio_emit_to_user_session(
CONSOLE_FEEDBACK_EVENT,
{"type": "succ", "message": f"File '{data['filePath']}' export was completed successfully."},
{"type": "succ",
"message": f"File '{data['filePath']}' export was completed successfully."},
data["uuid"],
data["sid"],
)
Expand Down
4 changes: 4 additions & 0 deletions tests/hail-20240821-2020-0.2.132-678e1f52b999.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
2024-08-21 20:20:57.557 Hail: WARN: This Hail JAR was compiled for Spark 3.5.0, running with Spark 3.5.2.
Compatibility is not guaranteed.
2024-08-21 20:20:58.340 Hail: INFO: SparkUI: http://192.168.68.114:4040
2024-08-21 20:20:58.841 Hail: INFO: Running Hail version 0.2.132-678e1f52b999
4 changes: 4 additions & 0 deletions tests/lovd.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
##fileformat=VCFv4.2
##contig=<ID=6,length=63719980>
#CHROM POS ID REF ALT QUAL FILTER INFO
6 63789078 . A G . . .
6 changes: 6 additions & 0 deletions tests/lovd_output.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=6,length=63719980>
##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.3.1 variant annotation. These include delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">
#CHROM POS ID REF ALT QUAL FILTER INFO
6 63789078 . A G . . SpliceAI=G|EYS|0.00|0.00|0.00|0.00|3|9|-20|9
2 changes: 1 addition & 1 deletion tests/test_lovd_fill_hg38.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def test_fill_hg38_no_variants(self):
lovd_fill_hg38(self.df)
self.assertEqual(self.df.shape[0], 0, "Empty dataframe should not add rows.")

def test_fill_hg38_NA_variants(self):
def test_fill_hg38_na_variants(self):
"""Test filling hg38 values when there are pd.NA variants in the dataframe."""
self.df = pd.DataFrame({
'VariantOnGenome/DNA': [pd.NA],
Expand Down
40 changes: 40 additions & 0 deletions tests/tools/spliceai/actual_output.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##fileDate=20191004
##reference=GRCh37/hg19
##contig=<ID=1,length=249250621>
##contig=<ID=2,length=243199373>
##contig=<ID=3,length=198022430>
##contig=<ID=4,length=191154276>
##contig=<ID=5,length=180915260>
##contig=<ID=6,length=171115067>
##contig=<ID=7,length=159138663>
##contig=<ID=8,length=146364022>
##contig=<ID=9,length=141213431>
##contig=<ID=10,length=135534747>
##contig=<ID=11,length=135006516>
##contig=<ID=12,length=133851895>
##contig=<ID=13,length=115169878>
##contig=<ID=14,length=107349540>
##contig=<ID=15,length=102531392>
##contig=<ID=16,length=90354753>
##contig=<ID=17,length=81195210>
##contig=<ID=18,length=78077248>
##contig=<ID=19,length=59128983>
##contig=<ID=20,length=63025520>
##contig=<ID=21,length=48129895>
##contig=<ID=22,length=51304566>
##contig=<ID=X,length=155270560>
##contig=<ID=Y,length=59373566>
##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.3.1 variant annotation. These include delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">
#CHROM POS ID REF ALT QUAL FILTER INFO
1 25000 . A C,G,T . . .
2 152389953 . T A,C,G . . SpliceAI=A|NEB|0.01|0.00|0.00|0.74|43|3|-26|3,C|NEB|0.04|0.00|0.00|0.71|43|3|-26|3,G|NEB|0.03|0.00|0.00|0.75|43|3|-26|3
2 179415988 . C CA . . SpliceAI=CA|TTN|0.07|1.00|0.00|0.00|-7|-1|35|-29
2 179446218 . ATACT A . . SpliceAI=A|TTN|0.00|0.00|0.02|0.91|-7|34|-11|8
2 179446218 . ATACT AT,ATA . . SpliceAI=AT|TTN|.|.|.|.|.|.|.|.,ATA|TTN|.|.|.|.|.|.|.|.
2 179642185 . G A . . SpliceAI=A|TTN|0.00|0.00|0.64|0.55|2|38|2|-38
19 38958362 . C T . . SpliceAI=T|RYR1|0.00|0.00|0.91|0.08|-28|-46|-2|-31
21 47406854 . CCA C . . SpliceAI=C|COL6A1|0.04|0.98|0.00|0.00|-38|4|38|4
21 47406856 . A AT . . SpliceAI=AT|COL6A1|0.03|0.99|0.00|0.00|-40|2|36|2
X 129274636 . A C,G,T . . SpliceAI=C|AIFM1|0.00|0.18|0.00|0.00|-28|-44|-44|45,G|AIFM1|0.00|0.17|0.00|0.00|-8|-44|-44|45,T|AIFM1|0.00|0.19|0.00|0.00|-2|-44|-44|45

0 comments on commit 3b909a9

Please sign in to comment.