integrated transcript data

iAMB-RWTH-Aachen · Feb 28, 2024 · 146fe83 · 146fe83
1 parent 8c1b5fd
commit 146fe83
Show file tree

Hide file tree

Showing 13 changed files with 868 additions and 197 deletions.
diff --git a/Data/TAModel/.~lock.Sinha-etal_2021_transcript-data.xlsx# b/Data/TAModel/.~lock.Sinha-etal_2021_transcript-data.xlsx#
@@ -0,0 +1 @@
+,samiralvdb,QPE-IAMBPC156,28.02.2024 17:17,file:///home/samiralvdb/.config/libreoffice/4;
diff --git a/Data/TAModel/2024-02-27_gene_enzyme_reaction_relation_Ecoli.xlsx b/Data/TAModel/2024-02-27_gene_enzyme_reaction_relation_Ecoli.xlsx
diff --git a/Data/TAModel/GeneList_ecoli.xlsx b/Data/TAModel/GeneList_ecoli.xlsx
diff --git a/Data/TAModel/Sinha-etal_2021_flux-data.xlsx b/Data/TAModel/Sinha-etal_2021_flux-data.xlsx
diff --git a/Scripts/.ipynb_checkpoints/create_ecolicore_pam_incl_UE-checkpoint.ipynb b/Scripts/.ipynb_checkpoints/create_ecolicore_pam_incl_UE-checkpoint.ipynb
diff --git a/Scripts/create_ecolicore_pam_incl_UE.ipynb b/Scripts/create_ecolicore_pam_incl_UE.ipynb
diff --git a/Scripts/ecolicore_tam_incl_transcript_info.py b/Scripts/ecolicore_tam_incl_transcript_info.py
@@ -1,13 +1,17 @@
+import matplotlib.pyplot as plt
 import pandas as pd
 import os
 
 from Scripts.tam_generation import set_up_toy_tam, set_up_ecolicore_tam
+from Scripts.pam_generation import set_up_ecolicore_pam
+
 sinha_ref_conditions = {
     'Holm et al': ['REF', 'NOX', 'ATP'], #WT and NADH  overexpression conditions, mu 0.72, 0.65,0.58 h-1 respectively
     'Ishii et al': ['WT_0.7h-1'],
     'Gerosa et al.': ['Glycerol','Glucose','Acetate', 'Pyruvate','Gluconate','Succinate','Galactose','Fructose'],
 }
 TRANSCRIPT_FILE_PATH = os.path.join('Data', 'TAModel', 'Sinha-etal_2021_transcript-data.xlsx')
+FLUX_FILE_PATH = os.path.join('Data', 'TAModel', 'Sinha-etal_2021_flux-data.xlsx')
 
 
 mrna_vs_mu_slope = 2.64E-10
@@ -26,19 +30,122 @@ def get_transcript_data(transcript_file_path:str = TRANSCRIPT_FILE_PATH, mmol =
     else:
         return expression_data_normalized
 
-if __name__ == '__main__':
+def get_flux_data(flux_file_path:str = FLUX_FILE_PATH,
+                        reference: str = 'Holm et al'):
+    """Load the measured fluxes of one reference study from the flux workbook.
+
+    Args:
+        flux_file_path: path to the Excel workbook with the flux data.
+        reference: name of the sheet to read; one sheet per publication.
+
+    Returns:
+        pandas.DataFrame indexed by the first column of the sheet
+        (the reaction identifiers).
+    """
+    flux_data = pd.read_excel(flux_file_path, sheet_name=reference, index_col=0)
+    if reference == 'Holm et al':
+        # Strip only the leading 'R_' prefix. An unanchored replace would also
+        # remove 'R_' from the middle of an identifier and corrupt it.
+        flux_data.index = flux_data.index.str.replace(r'^R_', '', regex=True)
+    return flux_data
+
+def get_pam_fluxes(substrate_uptake_rate):
+    """Optimize a freshly built E. coli core PAM and return its fluxes.
+
+    The glucose exchange reaction 'EX_glc__D_e' is bounded to
+    [-substrate_uptake_rate, 0] before the model is optimized.
+    """
+    model = set_up_ecolicore_pam()
+    model.change_reaction_bounds('EX_glc__D_e', lower_bound=-substrate_uptake_rate, upper_bound=0)
+    return model.optimize().fluxes
+
+def set_up_tamodel(strain ='REF'):
+    """Build the E. coli core TAM and load the transcript data of one strain into it.
+
+    The glucose exchange reaction 'EX_glc__D_e' gets the bounds [-1e6, 0].
+    For every gene in the transcript data whose 'mRNA_<gene>' entry exists in
+    the model, the transcript concentration is set to the value found in the
+    `strain` column, with an error of 1% of that value. Genes without a
+    matching transcript in the model are skipped.
+
+    Args:
+        strain: column of the transcript data to use.
+
+    Returns:
+        The model returned by set_up_ecolicore_tam(), with updated transcripts.
+    """
     tam = set_up_ecolicore_tam()
-    tam.optimize()
+    tam.change_reaction_bounds('EX_glc__D_e', lower_bound=-1e6, upper_bound=0)
     transcript_data_mmol = get_transcript_data()
 
     for gene, expression_data in transcript_data_mmol.iterrows():
-        transcript_id = 'mRNA_'+gene
+        transcript_id = 'mRNA_' + gene
         if not transcript_id in tam.transcripts: continue
-        transcript = tam.transcripts.get_by_id('mRNA_'+gene)
-        #testing wildtype condition
-        transcript.change_concentration(concentration=expression_data[0],
-                                        error= expression_data[0]*0.01)
-    print(tam.reactions.get_by_id('EX_glc__D_e'))
-
-    tam.optimize()
-    print(tam.summary())
+        transcript = tam.transcripts.get_by_id('mRNA_' + gene)
+        # use the measurement of the requested strain; the error is 1% of that value
+        transcript.change_concentration(concentration=expression_data[strain],
+                                        error=expression_data[strain] * 0.01)
+    return tam
+
+def get_tam_fluxes(tam,substrate_uptake_rate):
+    """Optimize the given TAM and return its fluxes.
+
+    The glucose exchange reaction 'EX_glc__D_e' of `tam` is bounded to
+    [-substrate_uptake_rate, 0] before optimizing; the model is modified in place.
+    """
+    tam.change_reaction_bounds('EX_glc__D_e', lower_bound=-substrate_uptake_rate, upper_bound=0)
+    return tam.optimize().fluxes
+
+def compare_flux_data(flux_data, pam_fluxes, tam_fluxes, strain ='REF', abs=True):
+    """Put measured, PAM and TAM fluxes side by side for every measured reaction.
+
+    Args:
+        flux_data: DataFrame of measured fluxes, indexed by reaction id, with
+            one column per strain.
+        pam_fluxes: fluxes of the PAM simulation, addressable by reaction id.
+        tam_fluxes: fluxes of the TAM simulation, addressable by reaction id.
+        strain: column of `flux_data` to compare against.
+        abs: if True the fluxes are reported as they are; if False every flux
+            is divided by the glucose uptake of its own data set
+            ('GLCptspp' in the measurements, 'GLCpts' in the simulations).
+
+    Returns:
+        DataFrame with the original `strain` column plus the columns
+        'strain' (measured, possibly normalized), 'PAM' and 'TAM'.
+        The table is also printed as markdown.
+    """
+    if abs:
+        glc_upt_ref = glc_upt_pam = glc_upt_tam = 1
+    else:
+        glc_upt_ref = flux_data[strain]['GLCptspp']
+        glc_upt_pam = pam_fluxes['GLCpts']
+        glc_upt_tam = tam_fluxes['GLCpts']
+
+    # the new column is literally called 'strain'; the plotting code relies on that name
+    flux_results_percentage = flux_data[[strain]].assign(
+        strain=lambda val: val[strain] / glc_upt_ref)
+    # float columns, so the simulated values are stored without an integer cast
+    flux_results_percentage['PAM'] = 0.0
+    flux_results_percentage['TAM'] = 0.0
+    for ori_rxn in flux_data.index:
+        # translate the identifier used in the measurements to the model identifier
+        rxn = ori_rxn
+        if 'pp' in rxn: rxn = rxn.replace('pp', '')
+        if 'biomass' in rxn: rxn = 'BIOMASS_Ecoli_core_w_GAM'
+        if 'EX_glc' in rxn: rxn = 'EX_glc__D_e'
+        if 'EX_ac' in rxn: rxn = 'EX_ac_e'
+        # .loc writes into the frame itself; chained indexing (df[col][row] = ...)
+        # assigns to a temporary and is lost when pandas copy-on-write is active
+        flux_results_percentage.loc[ori_rxn, 'PAM'] = pam_fluxes[rxn] / glc_upt_pam
+        flux_results_percentage.loc[ori_rxn, 'TAM'] = tam_fluxes[rxn] / glc_upt_tam
+
+    print(flux_results_percentage.to_markdown())
+    return flux_results_percentage
+
+def compare_fluxes_holm_reference(strain = 'REF'):
+    """Simulate one strain with the PAM and the TAM and compare both to the measured fluxes.
+
+    The measured 'GLCptspp' flux of `strain` is used as substrate uptake rate
+    for both models. Positive capacity sensitivity coefficients of the TAM
+    are printed, followed by the relative and the absolute flux tables, and
+    finally both comparisons are plotted.
+    """
+    measured_fluxes = get_flux_data()
+    glucose_uptake = measured_fluxes[strain]['GLCptspp']
+
+    pam_fluxes = get_pam_fluxes(substrate_uptake_rate=glucose_uptake)
+
+    tam = set_up_tamodel(strain)
+    tam_fluxes = get_tam_fluxes(tam, substrate_uptake_rate=glucose_uptake)
+    # only strictly positive coefficients are reported
+    for _, csc_row in tam.capacity_sensitivity_coefficients.iterrows():
+        if not csc_row.coefficient > 0:
+            continue
+        print(csc_row)
+
+    relative_fluxes = compare_flux_data(measured_fluxes, pam_fluxes, tam_fluxes, strain, abs=False)
+    absolute_fluxes = compare_flux_data(measured_fluxes, pam_fluxes, tam_fluxes, strain)
+
+    plot_flux_comparison(absolute_fluxes, relative_fluxes, strain)
+
+def plot_flux_comparison(flux_df_abs, flux_df_rel, strain):
+    """Plot simulated against measured fluxes, absolute (left) and relative (right).
+
+    Args:
+        flux_df_abs: frame with the columns 'strain' (measured), 'PAM' and 'TAM'
+            holding absolute fluxes.
+        flux_df_rel: frame with the same columns holding fluxes normalized to
+            the glucose uptake rate.
+        strain: strain name, used in the panel titles.
+    """
+    fig, ax = plt.subplots(1,2)
+
+    # (axis, data, title suffix, unit); relative fluxes are ratios and carry no unit
+    panels = (
+        (ax[0], flux_df_abs, ' absolute fluxes', '[$mmol/g_{CDW}/h$]'),
+        (ax[1], flux_df_rel, ' relative fluxes', '[relative to glucose uptake]'),
+    )
+    for axis, flux_df, title_suffix, unit in panels:
+        axis.scatter(flux_df['TAM'], flux_df['strain'], color='black', label='TAM')
+        axis.scatter(flux_df['PAM'], flux_df['strain'], color='red', label='PAM')
+        # reference line: a perfect prediction lies on the diagonal
+        axis.plot(flux_df['strain'], flux_df['strain'], linestyle='dashed')
+
+        axis.set_title(strain + title_suffix)
+        axis.set_xlabel('simulated flux ' + unit)
+        axis.set_ylabel('measured flux ' + unit)
+
+    fig.set_figwidth(20)
+    fig.set_figheight(10)
+    # both panels share the colour coding, so one legend on the right panel suffices
+    ax[1].legend()
+    plt.show()
+
+
+if __name__ == '__main__':
+    # wildtype reference strain of the Holm et al data set
+    print('Reference condition')
+    compare_fluxes_holm_reference(strain='REF')
+
+    # NOX strain, separated from the previous output by a ruler
+    print('\n-------------------------------------------------------------------------------------------------',
+          'mutation 1: NOX strain (overexpression of NADH oxidase)\n',
+          sep='\n')
+    compare_fluxes_holm_reference(strain='NOX')
+    # TODO print mRNA and protein concentrations to compare with lb
+    # TODO print shadowprices of mRNA (are lbs hit? how far can I constrain?)
+
+
+
+
+
+
diff --git a/Scripts/pam_generation.py b/Scripts/pam_generation.py
@@ -39,7 +39,7 @@ def set_up_ecolicore_pam(total_protein:bool = True, active_enzymes: bool = True,
     DATA_DIR = 'Data'
     MODEL_DIR = 'Models'
     PAM_DATA_FILE_PATH = os.path.join(DATA_DIR, 'proteinAllocationModel_iML1515_EnzymaticData_py.xls')
-    TAM_DATA_FILE_PATH = os.path.join(DATA_DIR, 'TAModel','2024-02-16_gene_enzyme_reaction_relation_Ecoli.xlsx')
+    TAM_DATA_FILE_PATH = os.path.join(DATA_DIR, 'TAModel','2024-02-27_gene_enzyme_reaction_relation_Ecoli.xlsx')
 
     # some other constants
     BIOMASS_REACTION = 'BIOMASS_Ecoli_core_w_GAM'
@@ -376,6 +376,7 @@ def _get_fwd_bckw_kcat(rxn_id: str, kcat:float, model:PAModel) -> Union[list, No
 
     # Iterate over each identifier in the input
     if base_id in model.reactions:
+        if not model.reactions.get_by_id(base_id).genes: return None
         # Determine the form of the identifier
         if rxn_id.endswith('_f'):
             kcat_fwd = kcat
@@ -390,6 +391,7 @@ def _get_fwd_bckw_kcat(rxn_id: str, kcat:float, model:PAModel) -> Union[list, No
         else:
             return None
     elif rxn_id in model.reactions:
+        if not model.reactions.get_by_id(rxn_id).genes: return None
         kcat_fwd = kcat
         kcat_rev = kcat
     else:

diff --git a/Scripts/parse_ecoli_gpr_info.py b/Scripts/parse_ecoli_gpr_info.py
@@ -51,8 +51,13 @@ def parse_gpr_relationships_from_Ecocyc():
          'Molecular-Weight-KiloDaltons', 'mrna_length', 'gpr']]
     enzyme_gene_reaction_relation.columns = ['Gene', 'Enzyme', 'Reaction', 'gene_id', 'enzyme_id',
                                              'molmass_kDa', 'mrna_length', 'gpr']
+
+    #Get enzyme and kcat information already available
     tam_info_merged = parse_enzymatic_data_information(enzyme_gene_reaction_relation)
 
+    #Append the information of each gene with information from the 'GeneList'
+    tam_info_merged = parse_ecoli_genome_information(tam_info_merged)
+
     #write to excel
     with pd.ExcelWriter(TAM_DATA_FILE) as writer:
         tam_info_merged.to_excel(writer, sheet_name='enzyme-gene-reaction')
@@ -82,7 +87,15 @@ def parse_enzymatic_data_information(enzyme_gene_reaction_relation):
     tam_info = tam_info.drop(['molmass_kDa', 'Reaction'], axis = 1)
     return tam_info
 
+def parse_ecoli_genome_information(tam_info_merged):
+    """Fill the 'mrna_length' column from the gene coordinates in the E. coli gene list.
+
+    The gene list is read from sheet 'GeneList' of
+    Data/TAModel/GeneList_ecoli.xlsx and matched to `tam_info_merged` via
+    'bnumber' == 'gene_id'. Rows whose gene is not in the gene list keep
+    their current 'mrna_length' (if that column exists).
+
+    Args:
+        tam_info_merged: DataFrame with at least a 'gene_id' column.
+
+    Returns:
+        The same DataFrame, with 'mrna_length' updated in place.
+    """
+    genome_information = pd.read_excel(os.path.join('Data', 'TAModel','GeneList_ecoli.xlsx'),
+                                sheet_name='GeneList').set_index('bnumber')
+    # absolute difference: a length must not become negative, whichever
+    # coordinate is the larger one
+    # NOTE(review): if the coordinates are inclusive the length is one larger - confirm
+    gene_length = (genome_information['end'] - genome_information['start']).abs()
+    # Series.map needs a unique index; keep the last entry like a row-wise loop would
+    gene_length = gene_length[~gene_length.index.duplicated(keep='last')]
+
+    # Write into the frame itself. Assigning to a filtered selection
+    # (df[df.gene_id == x]['mrna_length'] = ...) only changes a temporary copy.
+    mapped_length = tam_info_merged['gene_id'].map(gene_length)
+    if 'mrna_length' in tam_info_merged.columns:
+        mapped_length = mapped_length.fillna(tam_info_merged['mrna_length'])
+    tam_info_merged['mrna_length'] = mapped_length
+
+    return tam_info_merged
 
 if __name__ == '__main__':
     parse_gpr_relationships_from_Ecocyc()
diff --git a/Scripts/tam_generation.py b/Scripts/tam_generation.py
@@ -42,12 +42,14 @@ def set_up_ecolicore_tam(total_protein:bool = True, active_enzymes: bool = True,
     # Setting the relative paths
     DATA_DIR = os.path.join('Data')
     MODEL_DIR = os.path.join('Models')
-    TAM_DATA_FILE_PATH = os.path.join(DATA_DIR, 'TAModel','2024-02-16_gene_enzyme_reaction_relation_Ecoli.xlsx')
+    TAM_DATA_FILE_PATH = os.path.join(DATA_DIR, 'TAModel','2024-02-27_gene_enzyme_reaction_relation_Ecoli.xlsx')
 
 
     # some other constants
     BIOMASS_REACTION = 'BIOMASS_Ecoli_core_w_GAM'
-    TOTAL_PROTEIN_CONCENTRATION = 0.16995  # [g_prot/g_cdw]
+    # TOTAL_PROTEIN_CONCENTRATION = 0.16995  # [g_prot/g_cdw]
+    TOTAL_PROTEIN_CONCENTRATION = 0.185  # [g_prot/g_cdw]
+
     MRNA_MU = 0.00013049558330984208 # [g_mrna/g_cdw/h]
     MRNA_0= 1.7750480089801658e-05 # [g_mrna/g_cdw]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		,samiralvdb,QPE-IAMBPC156,28.02.2024 17:17,file:///home/samiralvdb/.config/libreoffice/4;