ecolicore tam is feasible. Added and-or relationships

iAMB-RWTH-Aachen · Feb 27, 2024 · 8c1b5fd · 8c1b5fd
1 parent 9822364
commit 8c1b5fd
Show file tree

Hide file tree

Showing 22 changed files with 1,173 additions and 216 deletions.
diff --git a/Data/TAModel/.~lock.Balakrishnan-etal_2023_RNAseq_tableS3.xlsx# b/Data/TAModel/.~lock.Balakrishnan-etal_2023_RNAseq_tableS3.xlsx#
diff --git a/Data/TAModel/Sinha-etal_2021_transcript-data.xlsx b/Data/TAModel/Sinha-etal_2021_transcript-data.xlsx
diff --git a/Data/proteinAllocationModel_iML1515_EnzymaticData_230503.xls b/Data/proteinAllocationModel_iML1515_EnzymaticData_230503.xls
diff --git a/Scripts/.ipynb_checkpoints/Translation_parameters_estimation-checkpoint.ipynb b/Scripts/.ipynb_checkpoints/Translation_parameters_estimation-checkpoint.ipynb
diff --git a/Scripts/Translation_parameters_estimation.ipynb b/Scripts/Translation_parameters_estimation.ipynb
diff --git a/Scripts/__pycache__/pam_generation.cpython-310.pyc b/Scripts/__pycache__/pam_generation.cpython-310.pyc
diff --git a/Scripts/__pycache__/toy_ec_pam.cpython-310.pyc b/Scripts/__pycache__/toy_ec_pam.cpython-310.pyc
diff --git a/Scripts/ecolicore_tam_incl_transcript_info.py b/Scripts/ecolicore_tam_incl_transcript_info.py
@@ -0,0 +1,44 @@
+import pandas as pd
+import os
+
+from Scripts.tam_generation import set_up_toy_tam, set_up_ecolicore_tam
+sinha_ref_conditions = {
+    'Holm et al': ['REF', 'NOX', 'ATP'], #WT and NADH  overexpression conditions, mu 0.72, 0.65,0.58 h-1 respectively
+    'Ishii et al': ['WT_0.7h-1'],
+    'Gerosa et al.': ['Glycerol','Glucose','Acetate', 'Pyruvate','Gluconate','Succinate','Galactose','Fructose'],
+}
+TRANSCRIPT_FILE_PATH = os.path.join('Data', 'TAModel', 'Sinha-etal_2021_transcript-data.xlsx')
+
+
+mrna_vs_mu_slope = 2.64E-10
+mrna_vs_mu_intercept = 3.59E-11
+
+
+def get_transcript_data(transcript_file_path:str = TRANSCRIPT_FILE_PATH, mmol = True,
+                        reference: str = 'Holm et al', growth_rates = [0.72, 0.65,0.58]):
+    expression_data = pd.read_excel(transcript_file_path, sheet_name=reference, index_col=0)
+    # Normalize columns by dividing by the sum of each column
+    expression_data_normalized = expression_data.div(expression_data.sum(axis=0), axis=1)
+    if mmol:
+        mrna_conc = [mrna_vs_mu_intercept + mrna_vs_mu_slope*mu for mu in growth_rates]
+        expression_data_mmol = expression_data_normalized.apply(lambda row: row * mrna_conc, axis=1)
+        return expression_data_mmol
+    else:
+        return expression_data_normalized
+
+if __name__ == '__main__':
+    tam = set_up_ecolicore_tam()
+    tam.optimize()
+    transcript_data_mmol = get_transcript_data()
+
+    for gene, expression_data in transcript_data_mmol.iterrows():
+        transcript_id = 'mRNA_'+gene
+        if not transcript_id in tam.transcripts: continue
+        transcript = tam.transcripts.get_by_id('mRNA_'+gene)
+        #testing wildtype condition
+        transcript.change_concentration(concentration=expression_data[0],
+                                        error= expression_data[0]*0.01)
+    print(tam.reactions.get_by_id('EX_glc__D_e'))
+
+    tam.optimize()
+    print(tam.summary())
diff --git a/Scripts/pam_generation.py b/Scripts/pam_generation.py
@@ -2,11 +2,12 @@
 import pandas as pd
 import os
 from typing import Union
+import ast
 
 # load PAMpy modules
-from PAModelpy.PAModel import PAModel
-from PAModelpy.EnzymeSectors import ActiveEnzymeSector, UnusedEnzymeSector, TransEnzymeSector
-from PAModelpy.configuration import Config
+from src.PAModelpy.PAModel import PAModel
+from src.PAModelpy.EnzymeSectors import ActiveEnzymeSector, UnusedEnzymeSector, TransEnzymeSector
+from src.PAModelpy.configuration import Config
 
 from Scripts.toy_ec_pam import build_toy_gem, build_active_enzyme_sector, build_translational_protein_sector, build_unused_protein_sector
 
@@ -26,7 +27,7 @@ def set_up_toy_pam(sensitivity =True):
     active_enzyme = build_active_enzyme_sector(config)
     unused_enzyme = build_unused_protein_sector(config)
     translation_enzyme = build_translational_protein_sector(config)
-    pamodel = PAModel(model, name='toy model MCA with enzyme constraints', active_sector=active_enzyme,
+    pamodel = PAModel(model, name='toy model with enzyme constraints', active_sector=active_enzyme,
                       translational_sector=translation_enzyme,
                       unused_sector=unused_enzyme, p_tot=Etot, sensitivity=sensitivity)
     pamodel.objective = 'R7'
@@ -35,122 +36,33 @@ def set_up_toy_pam(sensitivity =True):
 
 def set_up_ecolicore_pam(total_protein:bool = True, active_enzymes: bool = True, translational_enzymes:bool = True, unused_enzymes:bool = True, sensitivity =True):
     # Setting the relative paths
-    DATA_DIR = os.path.join(os.path.split(os.getcwd())[0], 'Data')
-    MODEL_DIR = os.path.join(os.path.split(os.getcwd())[0], 'Models')
+    DATA_DIR = 'Data'
+    MODEL_DIR = 'Models'
     PAM_DATA_FILE_PATH = os.path.join(DATA_DIR, 'proteinAllocationModel_iML1515_EnzymaticData_py.xls')
+    TAM_DATA_FILE_PATH = os.path.join(DATA_DIR, 'TAModel','2024-02-16_gene_enzyme_reaction_relation_Ecoli.xlsx')
 
     # some other constants
     BIOMASS_REACTION = 'BIOMASS_Ecoli_core_w_GAM'
     TOTAL_PROTEIN_CONCENTRATION = 0.16995  # [g_prot/g_cdw]
 
+    config = Config()
+    config.reset()
+    config.BIOMASS_REACTION = BIOMASS_REACTION
+
     # load the genome-scale information
     model = cobra.io.load_json_model(os.path.join(MODEL_DIR, 'e_coli_core.json'))
 
     #load example data for the E.coli iML1515 model
     if active_enzymes:
-        # load active enzyme sector information
-        enzyme_db = pd.read_excel(PAM_DATA_FILE_PATH, sheet_name='ActiveEnzymes')
-        enzyme_db = enzyme_db.set_index('rxnID')
-        # correct reaction IDS
-        for idx in enzyme_db.index.to_list():
-            # transprt reactions<
-
-            if 'pp' in idx:
-                idx_new = idx.replace('pp', '')
-                if idx_new not in enzyme_db.index:
-                    enzyme_db.rename(index={idx: idx_new}, inplace=True)
-            if 'ex' in idx:
-                idx_new = idx.replace('ex', '')
-                if idx_new not in enzyme_db.index:
-                    enzyme_db.rename(index={idx: idx_new}, inplace=True)
-
-                    # replace NaN values with unique identifiers
-        # replace NaN enzyme ids with a dummy enzyme identifier
-        # select the NaN values
-        nan_values = enzyme_db['EC_nmbr'].isnull()
-        # make a list with unique ids
-        nan_ids = [f'E{i}' for i in range(nan_values.sum())]
-        # replace nan values by unique id
-        enzyme_db.loc[nan_values, 'EC_nmbr'] = nan_ids
+        enzyme_db = _parse_enzyme_information_from_file(TAM_DATA_FILE_PATH)
 
         # create enzyme objects for each gene-associated reaction
-        kcats = {}
-        rxn2ec = {}
-        molmass = {}
-        for rxn in model.reactions:
-            if rxn.genes:
-                # correct transport reactions
-                if 't' in rxn.id:
-                    rxn.id = rxn.id
-                # are enzyme information in the PAM database?
-                rev = 0  # denotes reversibility
-                if rxn.lower_bound >= 0:
-                    # irreversible reaction (forward direction)
-                    rev = 0
-                    rxn_id = rxn.id  # save reaction ID for retrieveing molar masses/enzyme information later
-                    if rxn.id in enzyme_db.index:
-                        kcats[rxn.id] = {'f': enzyme_db.loc[rxn.id, 'kcat']}
-                elif rxn.upper_bound <= 0:
-                    # irreversible reaction (reverse direction)
-                    rev = 1
-                    rxn_id = rxn.id + '_b'
-                    if rxn_id in enzyme_db.index:
-                        kcats[rxn.id] = {'b': enzyme_db.loc[rxn_id, 'kcat']}
-                else:
-                    rev = 2
-                    # reversible reaction
-                    rxn_id_f = rxn.id + '_f'
-                    rxn_id_b = rxn.id + '_b'
-                    if rxn_id_f in enzyme_db.index and rxn_id_b in enzyme_db.index:
-                        rxn_id = rxn_id_f  # save reaction ID for retrieveing molar masses/enzyme information later
-                        kcats[rxn.id] = {'f': enzyme_db.loc[rxn_id_f, 'kcat'],
-                                         'b': enzyme_db.loc[rxn_id_b, 'kcat']}
-
-                    else:
-                        # try if only forward reaction is in database
-                        rxn_id = rxn.id  # save reaction ID for retrieveing molar masses/enzyme information later
-                        kcats[rxn.id] = {'f': enzyme_db.loc[rxn.id, 'kcat'],
-                                         'b': enzyme_db.loc[
-                                                  rxn.id, 'kcat'] / 2}  # deduce backwards kcat from forward value
-
-                # where enzyme information found?
-                if rxn.id in kcats.keys():
-                    # save molmass
-                    molmass[rxn.id] = enzyme_db.loc[rxn_id, 'molMass']
-                    # save enzyme information
-                    # is enzyme information NaN?
-                    if pd.isna(enzyme_db.loc[rxn_id, 'EC_nmbr']):
-                        rxn2ec[rxn.id] = ''
-                    else:
-                        rxn2ec[rxn.id] = enzyme_db.loc[rxn_id, 'EC_nmbr']
-
-
-                else:
-                    # no enzyme information found
-                    print('No enzyme information found for reaction: ' + rxn.id)
-                    # Create generic Enzyme with mean molar masses and kcat
-                    if rev == 0:
-                        kcats[rxn.id] = {'f': 22}
-                    elif rev == 1:
-                        kcats[rxn.id] = {'b': 22}
-                    else:
-                        kcats[rxn.id] = {'f': 22, 'b': 22}
-
-                    molmass[rxn.id] = 3.947778784340140e04
-
-        rxn2protein = {}
-        for rxn, ec in rxn2ec.items():
-            ec_dict = {**kcats[rxn], **{'molmass': molmass[rxn]}}
-            # add enzyme to enzymes related to reaction if these are already stored
-            if rxn in rxn2protein.keys():
-                rxn2protein[rxn] = {**rxn2protein[rxn], **{ec: ec_dict}}
-            # if not create new reaction entry
-            else:
-                rxn2protein[rxn] = {ec: ec_dict}
+        rxn2protein, protein2gene, gene2transcript = parse_reaction2protein(enzyme_db, model)
 
         # create active enzymes sector
-        active_enzyme_sector = ActiveEnzymeSector(rxn2protein=rxn2protein)
-
+        active_enzyme_sector = ActiveEnzymeSector(rxn2protein=rxn2protein,
+                                                  protein2gene=protein2gene,
+                                                  configuration=config)
     else:
         active_enzyme_sector = None
 
@@ -340,3 +252,168 @@ def parse_coefficients(pamodel):
 def parse_esc(pamodel):
     return pamodel.enzyme_sensitivity_coefficients.coefficient.to_list()
 
+
+def _parse_enzyme_information_from_file(file_path:str):
+    # load active enzyme sector information
+    enzyme_db = pd.read_excel(file_path, sheet_name='enzyme-gene-reaction')
+    # enzyme_db = enzyme_db.set_index('rxnID')
+    # correct reaction IDS
+    for idx in enzyme_db.rxnID.to_list():
+        # transport reactions
+        if 'pp' in idx:
+            idx_new = idx.replace('pp', '')
+            if idx_new not in enzyme_db.index:
+                enzyme_db.rename(index={idx: idx_new}, inplace=True)
+        if 'ex' in idx:
+            idx_new = idx.replace('ex', '')
+            if idx_new not in enzyme_db.index:
+                enzyme_db.rename(index={idx: idx_new}, inplace=True)
+
+                # replace NaN values with unique identifiers
+    # replace NaN enzyme ids with a dummy enzyme identifier
+    # select the NaN values
+    nan_values = enzyme_db['EC_nmbr'].isnull()
+    # make a list with unique ids
+    nan_ids = [f'E{i}' for i in range(nan_values.sum())]
+    # replace nan values by unique id
+    enzyme_db.loc[nan_values, 'EC_nmbr'] = nan_ids
+
+    return enzyme_db
+
+
+def parse_reaction2protein(enzyme_db: pd.DataFrame, model:cobra.Model) -> dict:
+    # Initialize dictionaries
+    rxn2protein = {}
+    protein2gene = {}
+    gene2transcript = {'gene_dummy': {'id': 'mRNA_gene_dummy', 'length': 750.0}}
+
+    # Iterate over each row in the DataFrame
+    for index, row in enzyme_db.iterrows():
+        # Parse data from the row
+        rxn_id = row['rxnID']
+        rxn_id = _check_rxn_identifier_format(rxn_id)
+        #only parse those reactions which are in the model
+        kcat_f_b = _get_fwd_bckw_kcat(rxn_id, row['kcat'], model)
+        if kcat_f_b is None: continue
+        #check if there is a forward or backward string which needs to be removed from the rxn id
+        if any([rxn_id.endswith('_f'), rxn_id.endswith('_b')]): rxn_id = rxn_id[:-2]
+
+        #get the identifiers and replace nan values by dummy placeholders
+        enzyme_id = row['EC_nmbr']
+        if not isinstance(enzyme_id, str): enzyme_id = 'Enzyme_'+rxn_id
+        gene_id = row['gene_id']
+        if not isinstance(gene_id, str): gene_id = 'gene_dummy'
+        mrna_length = row['mrna_length']
+
+        #get the gene-protein-reaction-associations
+        gpr = parse_gpr_information_for_protein2genes(row['gpr'])
+
+        # Create gene-protein-reaction associations
+        if enzyme_id not in protein2gene:
+            protein2gene[enzyme_id] = []
+        protein2gene[enzyme_id].append(gpr)
+
+        # Create gene2transcript dictionary
+        if gene_id not in gene2transcript.keys():
+            gene2transcript[gene_id] = {'id': f'mRNA_{gene_id}', 'length': mrna_length}
+
+        # Create rxn2protein dictionary
+        if rxn_id not in rxn2protein:
+            rxn2protein[rxn_id] = {}
+        if enzyme_id not in rxn2protein[rxn_id]:
+            rxn2protein[rxn_id][enzyme_id] = {
+                'f': kcat_f_b[0],  # Forward kcat
+                'b': kcat_f_b[1],  # Backward kcat
+                'molmass': row['molMass'],
+                'genes': [gene_id],
+                'complex_with': None
+            }
+        else:
+            rxn2protein[rxn_id][enzyme_id]['genes'].append(gene_id)
+
+    #if no enzyme info is found, add dummy enzyme with median kcat and molmass
+    for rxn in model.reactions:
+        if rxn.id not in rxn2protein.keys() and 'EX' not in rxn.id and 'BIOMASS' not in rxn.id and rxn.genes:
+            rev = _check_reaction_reversibility(rxn)
+            if rev == 0:
+                kcat_dict = {'f': 22}
+            elif rev ==1:
+                kcat_dict = {'b': 22}
+            else:
+                kcat_dict = {'f': 22,'b': 22}
+            # no enzyme information found
+            print('No enzyme information found for reaction: ' + rxn.id)
+            enzyme_id = 'Enzyme_'+rxn.id
+            rxn2protein[rxn.id] = {enzyme_id:{
+                **kcat_dict,
+                'molmass': 3.947778784340140e04}}
+            #add geneinfo for unknown enzymes
+            gpr_info = parse_gpr_information_for_protein2genes(rxn.gpr)
+            protein2gene[enzyme_id] = gpr_info
+
+            for gene in [gene for sublist in gpr_info for gene in sublist]:
+                if gene not in gene2transcript.keys():
+                    gene2transcript[gene] = {'id': f'mRNA_{gene}', 'length': 750.0}
+
+    return rxn2protein, protein2gene, gene2transcript
+
+def _check_rxn_identifier_format(rxn_id:str) -> str:
+    if 'pp' in rxn_id:
+        idx_new = rxn_id.replace('pp', '')
+    elif 'ex' in rxn_id:
+        idx_new = rxn_id.replace('ex', '')
+    else:
+        idx_new = rxn_id
+    return idx_new
+
+def _get_fwd_bckw_kcat(rxn_id: str, kcat:float, model:PAModel) -> Union[list, None]:
+    #skip exchange and biomass reaction
+    if 'EX' in rxn_id or 'BIOMASS' in rxn_id:
+        return None
+
+    # Extract the base identifier without any suffixes
+    base_id = rxn_id.split('_')[0]
+
+    # Iterate over each identifier in the input
+    if base_id in model.reactions:
+        # Determine the form of the identifier
+        if rxn_id.endswith('_f'):
+            kcat_fwd = kcat
+            kcat_rev = 0
+        elif rxn_id.endswith('_b'):
+            kcat_fwd = 0
+            kcat_rev = kcat
+        # identifier has no suffix
+        elif base_id == rxn_id:
+            kcat_fwd = kcat
+            kcat_rev = kcat
+        else:
+            return None
+    elif rxn_id in model.reactions:
+        kcat_fwd = kcat
+        kcat_rev = kcat
+    else:
+        return None
+    return [kcat_fwd, kcat_rev]
+
+def _check_reaction_reversibility(reaction):
+    if reaction.lower_bound >= 0:
+        # irreversible reaction (forward direction)
+        rev = 0
+    elif reaction.upper_bound <= 0:
+        # irreversible reaction (reverse direction)
+        rev = 1
+    else:
+        rev = 2
+        # reversible r
+    return rev
+
+def parse_gpr_information_for_protein2genes(gpr_info:str):
+    #filter out nan entries
+    if isinstance(gpr_info, str):
+        nested_list = ast.literal_eval(gpr_info)
+        # Extracting the inner lists and removing parentheses
+        gpr_list = [[[item.strip("(')")] + sublist[1:] for item in sublist[0].split(" or ")] for sublist in nested_list]
+        return gpr_list[0]
+    else:
+        return [['gene_dummy']]