From a3d59b3788e218dd179c9e5ef489256b271b6faa Mon Sep 17 00:00:00 2001
From: Josh Loecker <joshloecker@icloud.com>
Date: Wed, 8 Mar 2023 16:40:59 -0600
Subject: [PATCH] Use python 3.10

---
 environment.yaml             |   2 +-
 main/config.toml             |   2 +-
 main/py/project.py           | 201 +++++++++++++++++++++++++----------
 main/py/rnaseq_preprocess.py |  47 ++++----
 4 files changed, 169 insertions(+), 83 deletions(-)

diff --git a/environment.yaml b/environment.yaml
index 1bc0e55d..ff5f7b19 100644
--- a/environment.yaml
+++ b/environment.yaml
@@ -31,7 +31,7 @@ dependencies:
   - pandas==1.*
   - pytest==7.*
   - python-libsbml==5.*
-  - python==3.11.*
+  - python==3.10.*
   - r-biocmanager==1.*
   - r-devtools==2.*
   - r-ggrepel==0.*
diff --git a/main/config.toml b/main/config.toml
index 3439c818..7f59bef4 100644
--- a/main/config.toml
+++ b/main/config.toml
@@ -6,7 +6,7 @@ taxon_id = "human"                  # accepts a bioDBnet taxon id, "human", or "
 create_counts_matrix = true         # set to false if using a pregenerated matrix file
 gene_format = "Ensembl"             # accepts "Entrez", "Ensembl", and "Symbol"
 preprocess_mode = "provide-matrix"  # "create-matrix" or "provide-matrix"
-
+matrix_filename = ""                # This is required if preprocess_mode is "create-matrix"
 
 [rna_seq_generation]
 trnaseq_config_file = "trnaseq_data_inputs_auto.xlsx"
diff --git a/main/py/project.py b/main/py/project.py
index 4fc5fba9..ea73cfd3 100644
--- a/main/py/project.py
+++ b/main/py/project.py
@@ -1,11 +1,14 @@
 #!/usr/bin/python3
 
 import os
-import tomllib
+import toml
+from pathlib import Path
 from datetime import datetime
 from dataclasses import dataclass, field
 from typing import Literal
 
+from async_bioservices.input_database import InputDatabase
+
 @dataclass
 class general:
     taxon_id: str
@@ -14,13 +17,14 @@ class general:
 @dataclass
 class rnaseq_preprocess:
     create_counts_matrix: bool
-    gene_format: str
+    gene_format: InputDatabase
     preprocess_mode: Literal["provide-matrix", "create-matrix"]
+    matrix_filename: Path
     
 @dataclass
 class rna_seq_generation:
-    trnaseq_config_file: str
-    mrnaseq_config_file: str
+    trnaseq_config_filepath: Path
+    mrnaseq_config_filepath: Path
     technique: str
     rep_ratio: float
     group_ratio: float
@@ -31,7 +35,7 @@ class rna_seq_generation:
 
 @dataclass
 class proteomics_analysis:
-    proteomics_config_file: str
+    proteomics_config_file: Path
     rep_ratio: float
     batch_ratio: float
     high_rep_ratio: float
@@ -53,11 +57,11 @@ class model_creation:
     high_threshold: int
     output_filetypes: str
     objective_dict: dict
-    general_model_file: str
+    general_model_file: Path
     solver: str
-    boundary_reactions_filename: str
-    force_reactions_filename: str
-    exclude_reactions_filename: str
+    boundary_reactions_filepath: Path
+    force_reactions_filepath: Path
+    exclude_reactions_filepath: Path
     recon_algorithms: list[str] = field(default_factory=list[str])
 
 @dataclass
@@ -68,7 +72,7 @@ class disease_analysis:
 @dataclass
 class drug_repurposing:
     sovler: str
-    drug_raw_file: str
+    drug_raw_filepath: Path
     
 @dataclass
 class about:
@@ -79,8 +83,8 @@ def __post_init__(self):
         # Set self.date to todays date in the format of "Month Day, Year"
         self.date = datetime.now().strftime("%B %d, %Y")
         
-        # Get "VERSION" from the environment
-        self.version = os.environ["COMO_VERSION"]
+        # EXAMPLE: get "v1.0.0-BRANCH" from "refs/tags/v1.0.0-BRANCH"
+        self.version = os.environ["COMO_VERSION"].split("/")[-1]
 
 
 class Configs:
@@ -103,94 +107,173 @@ def __init__(self, projectdir):
         self.drug_repurposing: drug_repurposing = self._get_drug_repurposing()
         self.about: about = self._get_about()
 
+    def _read_from_toml(self):
+        """Parse `config.toml` located in `self.rootdir` and return the decoded data."""
+        toml_file: str = os.path.join(self.rootdir, "config.toml")
+        with open(toml_file, "r", encoding="utf-8") as i_stream:  # `toml.load` (unlike `tomllib.load`) needs a text stream, not "rb"
+            return toml.load(i_stream)
+
     def _get_general(self) -> general:
+        taxon_id = self._toml_data["general"]["taxon_id"]
+        context_names: list[str] = self._toml_data["general"]["context_names"]
+        
+        if isinstance(taxon_id, str):
+            if taxon_id.lower() not in ["human", "mouse"]:
+                raise ValueError("The taxon_id setting under 'general' must be either 'human' or 'mouse'.\nPlease edit `config.toml`")
+        
+        if not isinstance(context_names, list):
+            raise ValueError("The context_names setting under 'general' must be a list, such as `['type1', 'type2']`.\nPlease edit `config.toml`")
+        
         data: general = general(
-            taxon_id=self._toml_data["general"]["taxon_id"],
-            context_names=self._toml_data["general"]["context_names"],
+            taxon_id=taxon_id,
+            context_names=context_names,
         )
         return data
 
     def _get_rnaseq_preprocess(self) -> rnaseq_preprocess:
+        """Validate the 'rnaseq_preprocess' table of `config.toml`; raises ValueError on an invalid setting."""
+        create_counts_matrix = self._toml_data["rnaseq_preprocess"]["create_counts_matrix"]
+        gene_format = self._toml_data["rnaseq_preprocess"]["gene_format"]
+        preprocess_mode = self._toml_data["rnaseq_preprocess"]["preprocess_mode"]
+        matrix_filename = self._toml_data["rnaseq_preprocess"]["matrix_filename"]
+
+        if not isinstance(create_counts_matrix, bool):
+            raise ValueError("The create_counts_matrix setting under 'rnaseq_preprocess' must be either 'true' or 'false'.\nPlease edit `config.toml`")
+
+        # Match case-insensitively against lowercase spellings; a value matching no list is rejected, so gene_format_database is always bound below
+        if gene_format.lower() in ["ensembl", "ensemble", "ensg", "ensmusg", "ensembl id", "ensembl gene id"]:
+            gene_format_database: InputDatabase = InputDatabase.ENSEMBL_GENE_ID
+        elif gene_format.lower() in ["hgnc symbol", "hugo", "hugo symbol", "symbol", "hgnc", "gene symbol"]:
+            gene_format_database: InputDatabase = InputDatabase.GENE_SYMBOL
+        elif gene_format.lower() in ["entrez", "entres", "entrez id", "entrez number", "gene id"]:
+            gene_format_database: InputDatabase = InputDatabase.GENE_ID
+        else:
+            raise ValueError("The gene_format setting under 'rnaseq_preprocess' must be either 'Entrez', 'Ensembl', or 'Symbol'.\nPlease edit `config.toml`")
+
+        if preprocess_mode.lower() not in ["provide-matrix", "create-matrix"]:
+            raise ValueError("The preprocess_mode setting under 'rnaseq_preprocess' must be either 'provide-matrix' or 'create-matrix'.\nPlease edit `config.toml`")
+        # NOTE(review): main() in rnaseq_preprocess.py forwards matrix_filename as `provided_matrix_file`; confirm it should not be required for 'provide-matrix' instead
+        if preprocess_mode.lower() == "create-matrix" and matrix_filename == "":
+            raise ValueError("The matrix_filename setting under 'rnaseq_preprocess' must be set if the preprocess_mode is set to 'create-matrix'.\nPlease edit `config.toml`")
         data: rnaseq_preprocess = rnaseq_preprocess(
-            create_counts_matrix=self._toml_data["rnaseq_preprocess"]["create_counts_matrix"],
-            gene_format=self._toml_data["rnaseq_preprocess"]["gene_format"],
-            preprocess_mode=self._toml_data["rnaseq_preprocess"]["preprocess_mode"],
+            create_counts_matrix=create_counts_matrix,
+            gene_format=gene_format_database,
+            preprocess_mode=preprocess_mode,
+            matrix_filename=matrix_filename,
         )
         return data
 
     def _get_rna_seq_generation(self) -> rna_seq_generation:
+        trnaseq_config_filepath: Path = Path(self.configdir, self._toml_data["rna_seq_generation"]["trnaseq_config_file"])
+        mrnaseq_config_filepath: Path = Path(self.configdir, self._toml_data["rna_seq_generation"]["mrnaseq_config_file"])
+        technique = self._toml_data["rna_seq_generation"]["technique"]
+        rep_ratio = self._toml_data["rna_seq_generation"]["rep_ratio"]
+        group_ratio = self._toml_data["rna_seq_generation"]["group_ratio"]
+        rep_ratio_h = self._toml_data["rna_seq_generation"]["rep_ratio_h"]
+        group_ratio_h = self._toml_data["rna_seq_generation"]["group_ratio_h"]
+        quantile = self._toml_data["rna_seq_generation"]["quantile"]
+        min_zfpkm = self._toml_data["rna_seq_generation"]["min_zfpkm"]
+        
+        if technique.lower() not in ["quantile", "zfpkm", "cpm"]:
+            raise ValueError("The technique setting under 'rna_seq_generation' must be either 'quantile', 'zfpkm', or 'cpm'.\nPlease edit `config.toml`")
+        
         data: rna_seq_generation = rna_seq_generation(
-            trnaseq_config_file=self._toml_data["rna_seq_generation"]["trnaseq_config_file"],
-            mrnaseq_config_file=self._toml_data["rna_seq_generation"]["mrnaseq_config_file"],
-            technique=self._toml_data["rna_seq_generation"]["technique"],
-            rep_ratio=self._toml_data["rna_seq_generation"]["rep_ratio"],
-            group_ratio=self._toml_data["rna_seq_generation"]["group_ratio"],
-            rep_ratio_h=self._toml_data["rna_seq_generation"]["rep_ratio_h"],
-            group_ratio_h=self._toml_data["rna_seq_generation"]["group_ratio_h"],
-            quantile=self._toml_data["rna_seq_generation"]["quantile"],
-            min_zfpkm=self._toml_data["rna_seq_generation"]["min_zfpkm"],
+            trnaseq_config_filepath=trnaseq_config_filepath,
+            mrnaseq_config_filepath=mrnaseq_config_filepath,
+            technique=technique,
+            rep_ratio=rep_ratio,
+            group_ratio=group_ratio,
+            rep_ratio_h=rep_ratio_h,
+            group_ratio_h=group_ratio_h,
+            quantile=quantile,
+            min_zfpkm=min_zfpkm,
         )
         return data
 
     def _get_proteomics_analysis(self) -> proteomics_analysis:
+        proteomics_config_file: Path = Path(self.configdir, self._toml_data["proteomics_analysis"]["proteomics_config_file"])
+        rep_ratio = self._toml_data["proteomics_analysis"]["rep_ratio"]
+        batch_ratio = self._toml_data["proteomics_analysis"]["batch_ratio"]
+        high_rep_ratio = self._toml_data["proteomics_analysis"]["high_rep_ratio"]
+        high_batch_ratio = self._toml_data["proteomics_analysis"]["high_batch_ratio"]
+        quantile = self._toml_data["proteomics_analysis"]["quantile"]
+        
         data: proteomics_analysis = proteomics_analysis(
-            proteomics_config_file=self._toml_data["proteomics_analysis"]["proteomics_config_file"],
-            rep_ratio=self._toml_data["proteomics_analysis"]["rep_ratio"],
-            batch_ratio=self._toml_data["proteomics_analysis"]["batch_ratio"],
-            high_rep_ratio=self._toml_data["proteomics_analysis"]["high_rep_ratio"],
-            high_batch_ratio=self._toml_data["proteomics_analysis"]["high_batch_ratio"],
-            quantile=self._toml_data["proteomics_analysis"]["quantile"],
+            proteomics_config_file=proteomics_config_file,
+            rep_ratio=rep_ratio,
+            batch_ratio=batch_ratio,
+            high_rep_ratio=high_rep_ratio,
+            high_batch_ratio=high_batch_ratio,
+            quantile=quantile,
         )
         return data
 
     def _get_merge_xomics(self) -> merge_xomics:
+        expression_requirement = self._toml_data["merge_xomics"]["expression_requirement"]
+        requirement_adjust = self._toml_data["merge_xomics"]["requirement_adjust"]
+        total_rna_weight = self._toml_data["merge_xomics"]["total_rna_weight"]
+        mrna_weight = self._toml_data["merge_xomics"]["mrna_weight"]
+        single_cell_weight = self._toml_data["merge_xomics"]["single_cell_weight"]
+        proteomics_weight = self._toml_data["merge_xomics"]["proteomics_weight"]
+        
         data: merge_xomics = merge_xomics(
-            expression_requirement=self._toml_data["merge_xomics"]["expression_requirement"],
-            requirement_adjust=self._toml_data["merge_xomics"]["requirement_adjust"],
-            total_rna_weight=self._toml_data["merge_xomics"]["total_rna_weight"],
-            mrna_weight=self._toml_data["merge_xomics"]["mrna_weight"],
-            single_cell_weight=self._toml_data["merge_xomics"]["single_cell_weight"],
-            proteomics_weight=self._toml_data["merge_xomics"]["proteomics_weight"],
+            expression_requirement=expression_requirement,
+            requirement_adjust=requirement_adjust,
+            total_rna_weight=total_rna_weight,
+            mrna_weight=mrna_weight,
+            single_cell_weight=single_cell_weight,
+            proteomics_weight=proteomics_weight,
         )
         return data
 
     def _get_model_creation(self) -> model_creation:
+        low_threshold = self._toml_data["model_creation"]["low_threshold"]
+        high_threshold = self._toml_data["model_creation"]["high_threshold"]
+        output_filetypes = self._toml_data["model_creation"]["output_filetypes"]
+        objective_dict = self._toml_data["model_creation"]["objective_dict"]
+        general_model_file = self._toml_data["model_creation"]["general_model_file"]
+        solver = self._toml_data["model_creation"]["solver"]
+        boundary_reactions_filepath = self._toml_data["model_creation"]["boundary_reactions_filename"]
+        force_reactions_filepath = self._toml_data["model_creation"]["force_reactions_filename"]
+        exclude_reactions_filepath = self._toml_data["model_creation"]["exclude_reactions_filename"]
+        recon_algorithms = self._toml_data["model_creation"]["recon_algorithms"]
+        
         data: model_creation = model_creation(
-            low_threshold=self._toml_data["model_creation"]["low_threshold"],
-            high_threshold=self._toml_data["model_creation"]["high_threshold"],
-            output_filetypes=self._toml_data["model_creation"]["output_filetypes"],
-            objective_dict=self._toml_data["model_creation"]["objective_dict"],
-            general_model_file=self._toml_data["model_creation"]["general_model_file"],
-            solver=self._toml_data["model_creation"]["solver"],
-            boundary_reactions_filename=self._toml_data["model_creation"]["boundary_reactions_filename"],
-            force_reactions_filename=self._toml_data["model_creation"]["force_reactions_filename"],
-            exclude_reactions_filename=self._toml_data["model_creation"]["exclude_reactions_filename"],
-            recon_algorithms=self._toml_data["model_creation"]["recon_algorithms"],
+            low_threshold=low_threshold,
+            high_threshold=high_threshold,
+            output_filetypes=output_filetypes,
+            objective_dict=objective_dict,
+            general_model_file=general_model_file,
+            solver=solver,
+            boundary_reactions_filepath=boundary_reactions_filepath,
+            force_reactions_filepath=force_reactions_filepath,
+            exclude_reactions_filepath=exclude_reactions_filepath,
+            recon_algorithms=recon_algorithms,
         )
         return data
 
     def _get_disease_analysis(self) -> disease_analysis:
+        data_source = self._toml_data["disease_analysis"]["data_source"]
+        disease_names = self._toml_data["disease_analysis"]["disease_names"]
+        
         data: disease_analysis = disease_analysis(
-            data_source=self._toml_data["disease_analysis"]["data_source"],
-            disease_names=self._toml_data["disease_analysis"]["disease_names"],
+            data_source=data_source,
+            disease_names=disease_names,
         )
         return data
 
     def _get_drug_repurposing(self) -> drug_repurposing:
+        sovler = self._toml_data["drug_repurposing"]["sovler"]
+        drug_raw_filepath = self._toml_data["drug_repurposing"]["drug_raw_file"]
+        
         data: drug_repurposing = drug_repurposing(
-            sovler=self._toml_data["drug_repurposing"]["sovler"],
-            drug_raw_file=self._toml_data["drug_repurposing"]["drug_raw_file"],
+            sovler=sovler,
+            drug_raw_filepath=drug_raw_filepath,
         )
         return data
 
     def _get_about(self) -> about:
         return about()
-
-    def _read_from_toml(self):
-        toml_file: str = os.path.join(self.rootdir, "config.toml")
-        with open(toml_file, "rb") as i_stream:
-            data = tomllib.load(i_stream)
-        return data
     
 
 current_dir = os.getcwd()
@@ -210,3 +293,5 @@ def _read_from_toml(self):
 # Add leading "/", as it will not exist right now
 work_dir = os.path.join("/", work_dir)
 configs = Configs(work_dir)
+print(configs.about.version)
+print(configs.about.date)
diff --git a/main/py/rnaseq_preprocess.py b/main/py/rnaseq_preprocess.py
index d5c19273..69d912ae 100644
--- a/main/py/rnaseq_preprocess.py
+++ b/main/py/rnaseq_preprocess.py
@@ -2,7 +2,6 @@
 
 import pandas as pd
 
-from project import configs
 import re
 import os
 import sys
@@ -12,6 +11,7 @@
 import numpy as np
 from pathlib import Path
 
+from project import configs
 from async_bioservices import async_bioservices
 from async_bioservices.output_database import OutputDatabase
 from async_bioservices.input_database import InputDatabase
@@ -412,48 +412,49 @@ def parse_args(argv):
 
 
 def main(argv):
-    args = parse_args(argv)
+    context_names: list[str] = configs.general.context_names
+    gene_format: InputDatabase | str = configs.rnaseq_preprocess.gene_format  # project.py resolves this setting to an InputDatabase member
+    taxon_id: int | str = configs.general.taxon_id  # taxon_id is a field of `general`, not of `rnaseq_preprocess`
+    preprocess_mode: str = configs.rnaseq_preprocess.preprocess_mode
+    matrix_filename: str = configs.rnaseq_preprocess.matrix_filename
 
-    if args.gene_format.upper() in ["ENSEMBL", "ENSEMBLE", "ENSG", "ENSMUSG", "ENSEMBL ID", "ENSEMBL GENE ID"]:
+    if gene_format is InputDatabase.ENSEMBL_GENE_ID or str(gene_format).upper() in ["ENSEMBL", "ENSEMBLE", "ENSG", "ENSMUSG", "ENSEMBL ID", "ENSEMBL GENE ID"]:
         gene_format_database: InputDatabase = InputDatabase.ENSEMBL_GENE_ID
 
-    elif args.gene_format.upper() in ["HGNC SYMBOL", "HUGO", "HUGO SYMBOL", "SYMBOL", "HGNC", "GENE SYMBOL"]:
+    elif gene_format is InputDatabase.GENE_SYMBOL or str(gene_format).upper() in ["HGNC SYMBOL", "HUGO", "HUGO SYMBOL", "SYMBOL", "HGNC", "GENE SYMBOL"]:
         gene_format_database: InputDatabase = InputDatabase.GENE_SYMBOL
 
-    elif args.gene_format.upper() in ["ENTREZ", "ENTRES", "ENTREZ ID", "ENTREZ NUMBER" "GENE ID"]:
+    elif gene_format is InputDatabase.GENE_ID or str(gene_format).upper() in ["ENTREZ", "ENTRES", "ENTREZ ID", "ENTREZ NUMBER", "GENE ID"]:
         gene_format_database: InputDatabase = InputDatabase.GENE_ID
 
     else:  # provided invalid gene format
         print("Gene format (--gene_format) is invalid")
         print("Accepts 'Ensembl', 'Entrez', and 'HGNC symbol'")
-        print(f"You provided: {args.gene_format}")
+        print(f"You provided: {gene_format}")
         sys.exit()
 
     # handle species alternative ids
-    if type(args.taxon_id) == str:
-        if args.taxon_id.upper() == "HUMAN" or args.taxon_id.upper() == "HOMO SAPIENS":
-            taxon_id = TaxonIDs.HOMO_SAPIENS
-        elif args.taxon_id.upper() == "MOUSE" or args.taxon_id.upper() == "MUS MUSCULUS":
-            taxon_id = TaxonIDs.MUS_MUSCULUS
+    if isinstance(taxon_id, str):
+        if taxon_id.upper() == "HUMAN" or taxon_id.upper() == "HOMO SAPIENS":
+            taxon_id: TaxonIDs = TaxonIDs.HOMO_SAPIENS
+        elif taxon_id.upper() == "MOUSE" or taxon_id.upper() == "MUS MUSCULUS":
+            taxon_id: TaxonIDs = TaxonIDs.MUS_MUSCULUS
         else:
             print("--taxon-id must be either an integer, or accepted string ('mouse', 'human')")
             sys.exit(1)
-    elif type(args.taxon_id) == int:
-        taxon_id = args.taxon_id
+    elif isinstance(taxon_id, int):
+        taxon_id: int = int(taxon_id)
     else:
         print("--taxon-id must be either an integer, or accepted string ('mouse', 'human')")
         sys.exit(1)
 
-    # use mutually exclusive flag to set mode which tells which files to generate
-    if args.provide_matrix:
-        mode = "provide"
-    elif args.make_matrix:
-        mode = "make"
-    else:
-        print("--provide-matrix or --create-matrix must be set")
-        sys.exit(1)
-
-    handle_context_batch(args.context_names, mode, gene_format_database, taxon_id, args.provided_matrix_fname)
+    handle_context_batch(
+        context_names=context_names,
+        mode=preprocess_mode,
+        form=gene_format_database,
+        taxon_id=taxon_id,
+        provided_matrix_file=matrix_filename
+    )
 
 
 if __name__ == "__main__":