From a3d59b3788e218dd179c9e5ef489256b271b6faa Mon Sep 17 00:00:00 2001 From: Josh Loecker Date: Wed, 8 Mar 2023 16:40:59 -0600 Subject: [PATCH] Use python 3.10 --- environment.yaml | 2 +- main/config.toml | 2 +- main/py/project.py | 201 +++++++++++++++++++++++++---------- main/py/rnaseq_preprocess.py | 47 ++++---- 4 files changed, 169 insertions(+), 83 deletions(-) diff --git a/environment.yaml b/environment.yaml index 1bc0e55d..ff5f7b19 100644 --- a/environment.yaml +++ b/environment.yaml @@ -31,7 +31,7 @@ dependencies: - pandas==1.* - pytest==7.* - python-libsbml==5.* - - python==3.11.* + - python==3.10.* - r-biocmanager==1.* - r-devtools==2.* - r-ggrepel==0.* diff --git a/main/config.toml b/main/config.toml index 3439c818..7f59bef4 100644 --- a/main/config.toml +++ b/main/config.toml @@ -6,7 +6,7 @@ taxon_id = "human" # accepts a bioDBnet taxon id, "human", or " create_counts_matrix = true # set to false if using a pregenerated matrix file gene_format = "Ensembl" # accepts "Entrez", "Ensembl", and "Symbol" preprocess_mode = "provide-matrix" # "create-matrix" or "provide-matrix" - +matrix_filename = "" # This is required if preprocess_mode is "create-matrix" [rna_seq_generation] trnaseq_config_file = "trnaseq_data_inputs_auto.xlsx" diff --git a/main/py/project.py b/main/py/project.py index 4fc5fba9..ea73cfd3 100644 --- a/main/py/project.py +++ b/main/py/project.py @@ -1,11 +1,14 @@ #!/usr/bin/python3 import os -import tomllib +import toml +from pathlib import Path from datetime import datetime from dataclasses import dataclass, field from typing import Literal +from async_bioservices.input_database import InputDatabase + @dataclass class general: taxon_id: str @@ -14,13 +17,14 @@ class general: @dataclass class rnaseq_preprocess: create_counts_matrix: bool - gene_format: str + gene_format: InputDatabase preprocess_mode: Literal["provide-matrix", "create-matrix"] + matrix_filename: Path @dataclass class rna_seq_generation: - trnaseq_config_file: str - mrnaseq_config_file: str + trnaseq_config_filepath: Path + mrnaseq_config_filepath: Path technique: str rep_ratio: float group_ratio: float @@ -31,7 +35,7 @@ class rna_seq_generation: @dataclass class proteomics_analysis: - proteomics_config_file: str + proteomics_config_file: Path rep_ratio: float batch_ratio: float high_rep_ratio: float @@ -53,11 +57,11 @@ class model_creation: high_threshold: int output_filetypes: str objective_dict: dict - general_model_file: str + general_model_file: Path solver: str - boundary_reactions_filename: str - force_reactions_filename: str - exclude_reactions_filename: str + boundary_reactions_filepath: Path + force_reactions_filepath: Path + exclude_reactions_filepath: Path recon_algorithms: list[str] = field(default_factory=list[str]) @dataclass @@ -68,7 +72,7 @@ class disease_analysis: @dataclass class drug_repurposing: sovler: str - drug_raw_file: str + drug_raw_filepath: Path @dataclass class about: @@ -79,8 +83,8 @@ def __post_init__(self): # Set self.date to todays date in the format of "Month Day, Year" self.date = datetime.now().strftime("%B %d, %Y") - # Get "VERSION" from the environment - self.version = os.environ["COMO_VERSION"] + # EXAMPLE: get "v1.0.0-BRANCH" from "refs/tags/v1.0.0-BRANCH" + self.version = os.environ["COMO_VERSION"].split("/")[-1] class Configs: @@ -103,94 +107,173 @@ def __init__(self, projectdir): self.drug_repurposing: drug_repurposing = self._get_drug_repurposing() self.about: about = self._get_about() + def _read_from_toml(self): + toml_file: str = os.path.join(self.rootdir, "config.toml") + with open(toml_file, "rb") as i_stream: + data = toml.load(i_stream) + return data + def _get_general(self) -> general: + taxon_id = self._toml_data["general"]["taxon_id"] + context_names: list[str] = self._toml_data["general"]["context_names"] + + if isinstance(taxon_id, str): + if taxon_id.lower() not in ["human", "mouse"]: + raise ValueError("The taxon_id setting under 'general' must be either 'human' or 'mouse'.\nPlease edit `config.toml`") + + if not isinstance(context_names, list): + raise ValueError("The context_names setting under 'general' must be a list, such as `['type1', 'type2']`.\nPlease edit `config.toml`") + data: general = general( - taxon_id=self._toml_data["general"]["taxon_id"], - context_names=self._toml_data["general"]["context_names"], + taxon_id=taxon_id, + context_names=context_names, ) return data def _get_rnaseq_preprocess(self) -> rnaseq_preprocess: + create_counts_matrix = self._toml_data["rnaseq_preprocess"]["create_counts_matrix"] + gene_format = self._toml_data["rnaseq_preprocess"]["gene_format"] + preprocess_mode = self._toml_data["rnaseq_preprocess"]["preprocess_mode"] + matrix_filename = self._toml_data["rnaseq_preprocess"]["matrix_filename"] + + if not isinstance(create_counts_matrix, bool): + raise ValueError("The create_counts_matrix setting under 'rnaseq_preprocess' must be either 'true' or 'false'.\nPlease edit `config.toml`") + + if gene_format.lower() not in ["Entrez", "Ensembl", "Symbol"]: + raise ValueError("The gene_format setting under 'rnaseq_preprocess' must be either 'Entrez', 'Ensembl', or 'Symbol'.\nPlease edit `config.toml`") + else: + if gene_format.lower() in ["ensembl", "ensemble", "ensg", "ensmusg", "ensembl id", "ensembl gene id"]: + gene_format_database: InputDatabase = InputDatabase.ENSEMBL_GENE_ID + elif gene_format.lower() in ["hgnc symbol", "hugo", "hugo symbol", "symbol", "hgnc", "gene symbol"]: + gene_format_database: InputDatabase = InputDatabase.GENE_SYMBOL + elif gene_format.lower() in ["entrez", "entres", "entrez id", "entrez number" "gene id"]: + gene_format_database: InputDatabase = InputDatabase.GENE_ID + + if preprocess_mode.lower() not in ["provide-matrix", "create-matrix"]: + raise ValueError("The preprocess_mode setting under 'rnaseq_preprocess' must be either 'provide-matrix' or 'create-matrix'.\nPlease edit `config.toml`") + + if preprocess_mode.lower() == "create-matrix" and matrix_filename == "": + raise ValueError("The matrix_filename setting under 'rnaseq_preprocess' must be set if the preprocess_mode is set to 'create-matrix'.\nPlease edit `config.toml`") + data: rnaseq_preprocess = rnaseq_preprocess( - create_counts_matrix=self._toml_data["rnaseq_preprocess"]["create_counts_matrix"], - gene_format=self._toml_data["rnaseq_preprocess"]["gene_format"], - preprocess_mode=self._toml_data["rnaseq_preprocess"]["preprocess_mode"], + create_counts_matrix=create_counts_matrix, + gene_format=gene_format_database, + preprocess_mode=preprocess_mode, + matrix_filename=matrix_filename, ) return data def _get_rna_seq_generation(self) -> rna_seq_generation: + trnaseq_config_filepath: Path = Path(self.configdir, self._toml_data["rna_seq_generation"]["trnaseq_config_file"]) + mrnaseq_config_filepath: Path = Path(self.configdir, self._toml_data["rna_seq_generation"]["mrnaseq_config_file"]) + technique = self._toml_data["rna_seq_generation"]["technique"] + rep_ratio = self._toml_data["rna_seq_generation"]["rep_ratio"] + group_ratio = self._toml_data["rna_seq_generation"]["group_ratio"] + rep_ratio_h = self._toml_data["rna_seq_generation"]["rep_ratio_h"] + group_ratio_h = self._toml_data["rna_seq_generation"]["group_ratio_h"] + quantile = self._toml_data["rna_seq_generation"]["quantile"] + min_zfpkm = self._toml_data["rna_seq_generation"]["min_zfpkm"] + + if technique.lower() not in ["quantile", "zfpkm", "cpm"]: + raise ValueError("The technique setting under 'rna_seq_generation' must be either 'quantile', 'zfpkm', or 'cpm'.\nPlease edit `config.toml`") + data: rna_seq_generation = rna_seq_generation( - trnaseq_config_file=self._toml_data["rna_seq_generation"]["trnaseq_config_file"], - mrnaseq_config_file=self._toml_data["rna_seq_generation"]["mrnaseq_config_file"], - technique=self._toml_data["rna_seq_generation"]["technique"], - rep_ratio=self._toml_data["rna_seq_generation"]["rep_ratio"], - group_ratio=self._toml_data["rna_seq_generation"]["group_ratio"], - rep_ratio_h=self._toml_data["rna_seq_generation"]["rep_ratio_h"], - group_ratio_h=self._toml_data["rna_seq_generation"]["group_ratio_h"], - quantile=self._toml_data["rna_seq_generation"]["quantile"], - min_zfpkm=self._toml_data["rna_seq_generation"]["min_zfpkm"], + trnaseq_config_filepath=trnaseq_config_filepath, + mrnaseq_config_filepath=mrnaseq_config_filepath, + technique=technique, + rep_ratio=rep_ratio, + group_ratio=group_ratio, + rep_ratio_h=rep_ratio_h, + group_ratio_h=group_ratio_h, + quantile=quantile, + min_zfpkm=min_zfpkm, ) return data def _get_proteomics_analysis(self) -> proteomics_analysis: + proteomics_config_file: Path = Path(self.configdir, self._toml_data["proteomics_analysis"]["proteomics_config_file"]) + rep_ratio = self._toml_data["proteomics_analysis"]["rep_ratio"] + batch_ratio = self._toml_data["proteomics_analysis"]["batch_ratio"] + high_rep_ratio = self._toml_data["proteomics_analysis"]["high_rep_ratio"] + high_batch_ratio = self._toml_data["proteomics_analysis"]["high_batch_ratio"] + quantile = self._toml_data["proteomics_analysis"]["quantile"] + data: proteomics_analysis = proteomics_analysis( - proteomics_config_file=self._toml_data["proteomics_analysis"]["proteomics_config_file"], - rep_ratio=self._toml_data["proteomics_analysis"]["rep_ratio"], - batch_ratio=self._toml_data["proteomics_analysis"]["batch_ratio"], - high_rep_ratio=self._toml_data["proteomics_analysis"]["high_rep_ratio"], - high_batch_ratio=self._toml_data["proteomics_analysis"]["high_batch_ratio"], - quantile=self._toml_data["proteomics_analysis"]["quantile"], + proteomics_config_file=proteomics_config_file, + rep_ratio=rep_ratio, + batch_ratio=batch_ratio, + high_rep_ratio=high_rep_ratio, + high_batch_ratio=high_batch_ratio, + quantile=quantile, ) return data def _get_merge_xomics(self) -> merge_xomics: + expression_requirement = self._toml_data["merge_xomics"]["expression_requirement"] + requirement_adjust = self._toml_data["merge_xomics"]["requirement_adjust"] + total_rna_weight = self._toml_data["merge_xomics"]["total_rna_weight"] + mrna_weight = self._toml_data["merge_xomics"]["mrna_weight"] + single_cell_weight = self._toml_data["merge_xomics"]["single_cell_weight"] + proteomics_weight = self._toml_data["merge_xomics"]["proteomics_weight"] + data: merge_xomics = merge_xomics( - expression_requirement=self._toml_data["merge_xomics"]["expression_requirement"], - requirement_adjust=self._toml_data["merge_xomics"]["requirement_adjust"], - total_rna_weight=self._toml_data["merge_xomics"]["total_rna_weight"], - mrna_weight=self._toml_data["merge_xomics"]["mrna_weight"], - single_cell_weight=self._toml_data["merge_xomics"]["single_cell_weight"], - proteomics_weight=self._toml_data["merge_xomics"]["proteomics_weight"], + expression_requirement=expression_requirement, + requirement_adjust=requirement_adjust, + total_rna_weight=total_rna_weight, + mrna_weight=mrna_weight, + single_cell_weight=single_cell_weight, + proteomics_weight=proteomics_weight, ) return data def _get_model_creation(self) -> model_creation: + low_threshold = self._toml_data["model_creation"]["low_threshold"] + high_threshold = self._toml_data["model_creation"]["high_threshold"] + output_filetypes = self._toml_data["model_creation"]["output_filetypes"] + objective_dict = self._toml_data["model_creation"]["objective_dict"] + general_model_file = self._toml_data["model_creation"]["general_model_file"] + solver = self._toml_data["model_creation"]["solver"] + boundary_reactions_filepath = self._toml_data["model_creation"]["boundary_reactions_filename"] + force_reactions_filepath = self._toml_data["model_creation"]["force_reactions_filename"] + exclude_reactions_filepath = self._toml_data["model_creation"]["exclude_reactions_filename"] + recon_algorithms = self._toml_data["model_creation"]["recon_algorithms"] + data: model_creation = model_creation( - low_threshold=self._toml_data["model_creation"]["low_threshold"], - high_threshold=self._toml_data["model_creation"]["high_threshold"], - output_filetypes=self._toml_data["model_creation"]["output_filetypes"], - objective_dict=self._toml_data["model_creation"]["objective_dict"], - general_model_file=self._toml_data["model_creation"]["general_model_file"], - solver=self._toml_data["model_creation"]["solver"], - boundary_reactions_filename=self._toml_data["model_creation"]["boundary_reactions_filename"], - force_reactions_filename=self._toml_data["model_creation"]["force_reactions_filename"], - exclude_reactions_filename=self._toml_data["model_creation"]["exclude_reactions_filename"], - recon_algorithms=self._toml_data["model_creation"]["recon_algorithms"], + low_threshold=low_threshold, + high_threshold=high_threshold, + output_filetypes=output_filetypes, + objective_dict=objective_dict, + general_model_file=general_model_file, + solver=solver, + boundary_reactions_filepath=boundary_reactions_filepath, + force_reactions_filepath=force_reactions_filepath, + exclude_reactions_filepath=exclude_reactions_filepath, + recon_algorithms=recon_algorithms, ) return data def _get_disease_analysis(self) -> disease_analysis: + data_source = self._toml_data["disease_analysis"]["data_source"] + disease_names = self._toml_data["disease_analysis"]["disease_names"] + data: disease_analysis = disease_analysis( - data_source=self._toml_data["disease_analysis"]["data_source"], - disease_names=self._toml_data["disease_analysis"]["disease_names"], + data_source=data_source, + disease_names=disease_names, ) return data def _get_drug_repurposing(self) -> drug_repurposing: + sovler = self._toml_data["drug_repurposing"]["sovler"] + drug_raw_filepath = self._toml_data["drug_repurposing"]["drug_raw_file"] + data: drug_repurposing = drug_repurposing( - sovler=self._toml_data["drug_repurposing"]["sovler"], - drug_raw_file=self._toml_data["drug_repurposing"]["drug_raw_file"], + sovler=sovler, + drug_raw_filepath=drug_raw_filepath, ) return data def _get_about(self) -> about: return about() - - def _read_from_toml(self): - toml_file: str = os.path.join(self.rootdir, "config.toml") - with open(toml_file, "rb") as i_stream: - data = tomllib.load(i_stream) - return data current_dir = os.getcwd() @@ -210,3 +293,5 @@ def _read_from_toml(self): # Add leading "/", as it will not exist right now work_dir = os.path.join("/", work_dir) configs = Configs(work_dir) +print(configs.about.version) +print(configs.about.date) diff --git a/main/py/rnaseq_preprocess.py b/main/py/rnaseq_preprocess.py index d5c19273..69d912ae 100644 --- a/main/py/rnaseq_preprocess.py +++ b/main/py/rnaseq_preprocess.py @@ -2,7 +2,6 @@ import pandas as pd -from project import configs import re import os import sys @@ -12,6 +11,7 @@ import numpy as np from pathlib import Path +from project import configs from async_bioservices import async_bioservices from async_bioservices.output_database import OutputDatabase from async_bioservices.input_database import InputDatabase @@ -412,48 +412,49 @@ def parse_args(argv): def main(argv): - args = parse_args(argv) + context_names: list[str] = configs.general.context_names + gene_format: str = configs.rnaseq_preprocess.gene_format + taxon_id: str = configs.rnaseq_preprocess.taxon_id + preprocess_mode: str = configs.rnaseq_preprocess.preprocess_mode + matrix_filename: str = configs.rnaseq_preprocess.matrix_filename - if args.gene_format.upper() in ["ENSEMBL", "ENSEMBLE", "ENSG", "ENSMUSG", "ENSEMBL ID", "ENSEMBL GENE ID"]: + if gene_format.upper() in ["ENSEMBL", "ENSEMBLE", "ENSG", "ENSMUSG", "ENSEMBL ID", "ENSEMBL GENE ID"]: gene_format_database: InputDatabase = InputDatabase.ENSEMBL_GENE_ID - elif args.gene_format.upper() in ["HGNC SYMBOL", "HUGO", "HUGO SYMBOL", "SYMBOL", "HGNC", "GENE SYMBOL"]: + elif gene_format.upper() in ["HGNC SYMBOL", "HUGO", "HUGO SYMBOL", "SYMBOL", "HGNC", "GENE SYMBOL"]: gene_format_database: InputDatabase = InputDatabase.GENE_SYMBOL - elif args.gene_format.upper() in ["ENTREZ", "ENTRES", "ENTREZ ID", "ENTREZ NUMBER" "GENE ID"]: + elif gene_format.upper() in ["ENTREZ", "ENTRES", "ENTREZ ID", "ENTREZ NUMBER" "GENE ID"]: gene_format_database: InputDatabase = InputDatabase.GENE_ID else: # provided invalid gene format print("Gene format (--gene_format) is invalid") print("Accepts 'Ensembl', 'Entrez', and 'HGNC symbol'") - print(f"You provided: {args.gene_format}") + print(f"You provided: {gene_format}") sys.exit() # handle species alternative ids - if type(args.taxon_id) == str: - if args.taxon_id.upper() == "HUMAN" or args.taxon_id.upper() == "HOMO SAPIENS": - taxon_id = TaxonIDs.HOMO_SAPIENS - elif args.taxon_id.upper() == "MOUSE" or args.taxon_id.upper() == "MUS MUSCULUS": - taxon_id = TaxonIDs.MUS_MUSCULUS + if isinstance(taxon_id, str): + if taxon_id.upper() == "HUMAN" or taxon_id.upper() == "HOMO SAPIENS": + taxon_id: TaxonIDs = TaxonIDs.HOMO_SAPIENS + elif taxon_id.upper() == "MOUSE" or taxon_id.upper() == "MUS MUSCULUS": + taxon_id: TaxonIDs = TaxonIDs.MUS_MUSCULUS else: print("--taxon-id must be either an integer, or accepted string ('mouse', 'human')") sys.exit(1) - elif type(args.taxon_id) == int: - taxon_id = args.taxon_id + elif isinstance(taxon_id, int): + taxon_id: int = int(taxon_id) else: print("--taxon-id must be either an integer, or accepted string ('mouse', 'human')") sys.exit(1) - # use mutually exclusive flag to set mode which tells which files to generate - if args.provide_matrix: - mode = "provide" - elif args.make_matrix: - mode = "make" - else: - print("--provide-matrix or --create-matrix must be set") - sys.exit(1) - - handle_context_batch(args.context_names, mode, gene_format_database, taxon_id, args.provided_matrix_fname) + handle_context_batch( + context_names=context_names, + mode=preprocess_mode, + form=gene_format_database, + taxon_id=taxon_id, + provided_matrix_file=matrix_filename + ) if __name__ == "__main__":