improving pylint score

Strexas · Sep 15, 2024 · 3b909a9 · 3b909a9
1 parent 31f85fc
commit 3b909a9
Show file tree

Hide file tree

Showing 13 changed files with 107 additions and 107 deletions.
diff --git a/api/__init__.py b/api/__init__.py
@@ -32,7 +32,7 @@
 )
 
 # DATA COLLECTION IMPORT
-from .data.collection import (
+from .data.downloading import (
     # Custom exceptions
     BadResponseException,
     DownloadError,

diff --git a/api/data/__init__.py b/api/data/__init__.py
@@ -1,65 +0,0 @@
-"""
-Package for data collection purposes provides both collection and refactoring functionality.
-
-Data from LOVD, ClinVar and GnomAd databases can be downloaded using this package. GnomAd and
-ClinVar are limited with EYS gene, but it is possible to download data for any gene in LOVD.
-
-All necessary functionality can be imported directly from data without
-specifying the module.
-
-data collection pipeline example is established for project's specific usage.
-"""
-
-# CONSTANTS IMPORT
-from .constants import (
-  # URLs for LOVD database
-  LOVD_URL, LOVD_URL_EYS, LOVD_FILE_URL, LOVD_FILE_URL_EYS,
-
-  # URLs for gnomAD database
-  GNOMAD_URL, GNOMAD_URL_EYS, GNOMAD_FILE_URL_EYS,
-
-  # URLs for ClinVar database
-  CLINVAR_URL, CLINVAR_URL_EYS, CLINVAR_FILE_URL_EYS,
-
-  # Paths for data storage
-  DATA_PATH, LOVD_PATH, GNOMAD_PATH, CLINVAR_PATH,
-
-  # Data types for tables
-  LOVD_TABLES_DATA_TYPES,
-
-  # Paths for database downloads
-  DATABASES_DOWNLOAD_PATHS,
-
-  GNOMAD_PATH,
-)
-
-# DATA COLLECTION IMPORT
-from .collection import (
-    # Custom exceptions
-    BadResponseException,
-    DownloadError,
-
-    # Custom utility functions
-    get_file_from_url,
-
-    # Functions for downloading databases
-    download_lovd_database_for_eys_gene,
-    download_genes_lovd,
-    download_database_for_eys_gene,
-
-    # Functions for storing databases
-    store_database_for_eys_gene
-)
-
-# DATA REFACTORING IMPORT
-from .refactoring import (
-    # Functions for refactoring data
-    set_lovd_dtypes,
-    parse_lovd,
-    from_clinvar_name_to_cdna_position,
-    save_lovd_as_vcf,
-    request_gnomad_api_data,
-    merge_gnomad_lovd,
-    parse_gnomad,
-    set_gnomad_dtypes,
-)

diff --git a/api/data/downloading.py b/api/data/downloading.py
@@ -188,4 +188,3 @@ def store_database_for_eys_gene(database_name, override=False):
         download_lovd_database_for_eys_gene(override)
     else:
         download_database_for_eys_gene(database_name, override)
-
diff --git a/api/data/refactoring.py b/api/data/refactoring.py
@@ -197,8 +197,9 @@ def lovd_fill_hg38(lovd: pd.DataFrame):
         return
     lovd.loc[:,'hg38_gnomad_format'] = lovd.loc[:,'VariantOnGenome/DNA/hg38'].replace('', pd.NA)
     missing_hg38_mask = lovd.loc[:,'hg38_gnomad_format'].isna()
-    lovd.loc[missing_hg38_mask, 'hg38_gnomad_format'] = lovd.loc[missing_hg38_mask, 'VariantOnGenome/DNA'].apply(
-        convert_hg19_if_missing)
+    lovd.loc[missing_hg38_mask, 'hg38_gnomad_format'] = (lovd.loc[missing_hg38_mask,
+                                                                'VariantOnGenome/DNA'].
+                                                         apply(convert_hg19_if_missing))
     lovd.loc[:,'hg38_gnomad_format'] = lovd.loc[:,'hg38_gnomad_format'].apply(convert_to_gnomad_gen)
 
 
@@ -316,6 +317,38 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"):
             f.write("\n")
 
 
+def find_popmax_in_gnomad(data):
+    """
+    Finds popmax in gnomad data
+    :param DataFrame data: Gnomad data.
+    """
+
+    population_mapping = {
+            'afr': 'African/African American',
+            'eas': 'East Asian',
+            'asj': 'Ashkenazi Jew',
+            'sas': 'South Asian',
+            'nfe': 'European (non-Finnish)',
+            'fin': 'European (Finnish)',
+            'mid': 'Middle Eastern',
+            'amr': 'Admixed American',
+            'ami': "Amish",
+            'remaining': 'Remaining',
+            '': ''
+        }
+    population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining']
+
+    for i in range(data.shape[0]):
+        max_pop = 0
+        max_id = ''
+        for population_id in population_ids:
+            if data.loc[i, f'Allele_Frequency_{population_id}'] > max_pop:
+                max_pop = data.loc[i, f'Allele_Frequency_{population_id}']
+                max_id = population_id
+        data.loc[i, 'Popmax'] = max_pop
+        data.loc[i, 'Popmax population'] = population_mapping[max_id]
+
+
 def prepare_popmax_calculation(df, pop_data, name, pop_ids, index):
     """
     prepares the calculation of popmax and popmax population for a variant.
@@ -412,44 +445,26 @@ def request_gnomad_api_data(gene_name):
     df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0)  # Protein change
 
     df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an']
-    df.loc[:, 'Homozygote Count'] = df.loc[:, 'exome.ac_hom'].fillna(0) + df.loc[:, 'genome.ac_hom'].fillna(0)
+    df.loc[:, 'Homozygote Count'] = (df.loc[:, 'exome.ac_hom'].fillna(0)
+                                     + df.loc[:, 'genome.ac_hom'].fillna(0))
     exome_populations = df.loc[:, 'exome.populations']
     genome_populations = df.loc[:, 'genome.populations']
     population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining']
 
-    for i in range(len(exome_populations)):
+    for i in range(exome_populations.shape[0]):
         exome_pop = exome_populations[i]
         prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i)
         genome_pop = genome_populations[i]
         prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i)
 
     for population_id in population_ids:
         df.loc[:, f'Allele_Frequency_{population_id}'] = (
-               (df.loc[:, f'exome_ac_{population_id}'].fillna(0) + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) / (
-                df.loc[:, f'exome_an_{population_id}'].fillna(0) + df.loc[:, f'genome_an_{population_id}'].fillna(0)))
-    population_mapping = {
-            'afr': 'African/African American',
-            'eas': 'East Asian',
-            'asj': 'Ashkenazi Jew',
-            'sas': 'South Asian',
-            'nfe': 'European (non-Finnish)',
-            'fin': 'European (Finnish)',
-            'mid': 'Middle Eastern',
-            'amr': 'Admixed American',
-            'ami': "Amish",
-            'remaining': 'Remaining',
-            '': ''
-        }
+               (df.loc[:, f'exome_ac_{population_id}'].fillna(0)
+                + df.loc[:, f'genome_ac_{population_id}'].fillna(0))
+               / (df.loc[:, f'exome_an_{population_id}'].fillna(0)
+                  + df.loc[:, f'genome_an_{population_id}'].fillna(0)))
 
-    for i in range(df.shape[0]):
-        max_pop = 0
-        max_id = ''
-        for population_id in population_ids:
-            if df.loc[i, f'Allele_Frequency_{population_id}'] > max_pop:
-                max_pop = df.loc[i, f'Allele_Frequency_{population_id}']
-                max_id = population_id
-        df.loc[i, 'Popmax'] = max_pop
-        df.loc[i, 'Popmax population'] = population_mapping[max_id]
+    find_popmax_in_gnomad(df)
     not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency',
                    'variant_id', 'cDNA change', 'Protein change']
 

diff --git a/api/tools/__init__.py b/api/tools/__init__.py
@@ -1,7 +0,0 @@
-"""
-This module provides access to the tools for fetching scores for genetic variants.
-"""
-
-from .revel.revel import (
-    get_revel_scores
-)

diff --git a/api/tools/revel/revel.py b/api/tools/revel/revel.py
@@ -1,7 +1,10 @@
-""" Retrieves REVEL scores for specific variants based on chromosome and position from a CSV file. """
+"""
+Retrieves REVEL scores for specific variants based on chromosome and position from a CSV file.
+"""
+
 
-import pandas as pd
 import os
+import pandas as pd
 
 current_script_dir = os.path.dirname(os.path.abspath(__file__))
 revel_file = os.path.join(current_script_dir, 'revel_with_transcript_ids')
@@ -16,7 +19,7 @@ def get_revel_scores(chromosome, position):
     """
     variants = []
     revel_data = pd.read_csv(revel_file)
-    
+
     variants = revel_data[
         (revel_data['chr'] == chromosome) &
         (revel_data['hg19_pos'] == position)

diff --git a/app/back-end/__init__.py b/app/back-end/__init__.py
diff --git a/app/back-end/src/events/workspace_export_event.py b/app/back-end/src/events/workspace_export_event.py
@@ -69,7 +69,8 @@ def handle_workspace_export_feedback(data):
         if data["status"] == "success":
             socketio_emit_to_user_session(
                 CONSOLE_FEEDBACK_EVENT,
-                {"type": "succ", "message": f"File '{data['filePath']}' export was completed successfully."},
+                {"type": "succ",
+                 "message": f"File '{data['filePath']}' export was completed successfully."},
                 data["uuid"],
                 data["sid"],
             )

diff --git a/tests/hail-20240821-2020-0.2.132-678e1f52b999.log b/tests/hail-20240821-2020-0.2.132-678e1f52b999.log
@@ -0,0 +1,4 @@
+2024-08-21 20:20:57.557 Hail: WARN: This Hail JAR was compiled for Spark 3.5.0, running with Spark 3.5.2.
+  Compatibility is not guaranteed.
+2024-08-21 20:20:58.340 Hail: INFO: SparkUI: http://192.168.68.114:4040
+2024-08-21 20:20:58.841 Hail: INFO: Running Hail version 0.2.132-678e1f52b999
diff --git a/tests/lovd.vcf b/tests/lovd.vcf
@@ -0,0 +1,4 @@
+##fileformat=VCFv4.2
+##contig=<ID=6,length=63719980>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+6	63789078	.	A	G	.	.	.
diff --git a/tests/lovd_output.vcf b/tests/lovd_output.vcf
@@ -0,0 +1,6 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##contig=<ID=6,length=63719980>
+##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.3.1 variant annotation. These include delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+6	63789078	.	A	G	.	.	SpliceAI=G|EYS|0.00|0.00|0.00|0.00|3|9|-20|9
diff --git a/tests/test_lovd_fill_hg38.py b/tests/test_lovd_fill_hg38.py
@@ -88,7 +88,7 @@ def test_fill_hg38_no_variants(self):
         lovd_fill_hg38(self.df)
         self.assertEqual(self.df.shape[0], 0, "Empty dataframe should not add rows.")
 
-    def test_fill_hg38_NA_variants(self):
+    def test_fill_hg38_na_variants(self):
         """Test filling hg38 values when there are pd.NA variants in the dataframe."""
         self.df = pd.DataFrame({
             'VariantOnGenome/DNA': [pd.NA],

diff --git a/tests/tools/spliceai/actual_output.vcf b/tests/tools/spliceai/actual_output.vcf
@@ -0,0 +1,40 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##fileDate=20191004
+##reference=GRCh37/hg19
+##contig=<ID=1,length=249250621>
+##contig=<ID=2,length=243199373>
+##contig=<ID=3,length=198022430>
+##contig=<ID=4,length=191154276>
+##contig=<ID=5,length=180915260>
+##contig=<ID=6,length=171115067>
+##contig=<ID=7,length=159138663>
+##contig=<ID=8,length=146364022>
+##contig=<ID=9,length=141213431>
+##contig=<ID=10,length=135534747>
+##contig=<ID=11,length=135006516>
+##contig=<ID=12,length=133851895>
+##contig=<ID=13,length=115169878>
+##contig=<ID=14,length=107349540>
+##contig=<ID=15,length=102531392>
+##contig=<ID=16,length=90354753>
+##contig=<ID=17,length=81195210>
+##contig=<ID=18,length=78077248>
+##contig=<ID=19,length=59128983>
+##contig=<ID=20,length=63025520>
+##contig=<ID=21,length=48129895>
+##contig=<ID=22,length=51304566>
+##contig=<ID=X,length=155270560>
+##contig=<ID=Y,length=59373566>
+##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.3.1 variant annotation. These include delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+1	25000	.	A	C,G,T	.	.	.
+2	152389953	.	T	A,C,G	.	.	SpliceAI=A|NEB|0.01|0.00|0.00|0.74|43|3|-26|3,C|NEB|0.04|0.00|0.00|0.71|43|3|-26|3,G|NEB|0.03|0.00|0.00|0.75|43|3|-26|3
+2	179415988	.	C	CA	.	.	SpliceAI=CA|TTN|0.07|1.00|0.00|0.00|-7|-1|35|-29
+2	179446218	.	ATACT	A	.	.	SpliceAI=A|TTN|0.00|0.00|0.02|0.91|-7|34|-11|8
+2	179446218	.	ATACT	AT,ATA	.	.	SpliceAI=AT|TTN|.|.|.|.|.|.|.|.,ATA|TTN|.|.|.|.|.|.|.|.
+2	179642185	.	G	A	.	.	SpliceAI=A|TTN|0.00|0.00|0.64|0.55|2|38|2|-38
+19	38958362	.	C	T	.	.	SpliceAI=T|RYR1|0.00|0.00|0.91|0.08|-28|-46|-2|-31
+21	47406854	.	CCA	C	.	.	SpliceAI=C|COL6A1|0.04|0.98|0.00|0.00|-38|4|38|4
+21	47406856	.	A	AT	.	.	SpliceAI=AT|COL6A1|0.03|0.99|0.00|0.00|-40|2|36|2
+X	129274636	.	A	C,G,T	.	.	SpliceAI=C|AIFM1|0.00|0.18|0.00|0.00|-28|-44|-44|45,G|AIFM1|0.00|0.17|0.00|0.00|-8|-44|-44|45,T|AIFM1|0.00|0.19|0.00|0.00|-2|-44|-44|45
Original file line number	Diff line number	Diff line change
Expand Up		@@ -188,4 +188,3 @@ def store_database_for_eys_gene(database_name, override=False):
		download_lovd_database_for_eys_gene(override)
		else:
		download_database_for_eys_gene(database_name, override)