-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Create Python_RNASeq_amazonQ_result.txt
- Loading branch information
1 parent
eff2004
commit a34eac9
Showing
1 changed file
with
123 additions
and
0 deletions.
There are no files selected for viewing
123 changes: 123 additions & 0 deletions
123
docs/chatbot_comparison/code_dev/Python_RNASeq_amazonQ_result.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
#!/usr/bin/env python3 | ||
|
||
""" | ||
Differential Expression Analysis using edgeR via rpy2 | ||
This script performs RNA-Seq differential expression analysis using the edgeR package. | ||
""" | ||
|
||
import pandas as pd | ||
import numpy as np | ||
from rpy2.robjects import pandas2ri, r | ||
from rpy2.robjects.packages import importr | ||
import rpy2.robjects as ro | ||
import warnings | ||
|
||
# Silence every warning category for the whole process. This is a blanket
# filter: it also hides deprecation notices raised by pandas and rpy2.
warnings.simplefilter("ignore")
|
||
def load_count_data(file_path):
    """
    Read an RNA-Seq count table from a CSV file.

    The file is parsed with ``pandas.read_csv`` using its default options, so
    the first line is treated as the header row.

    Args:
        file_path (str): Location of the CSV file holding the count data.

    Returns:
        pandas.DataFrame: The parsed table, expected to hold one gene per row
        and one sample per column.
    """
    return pd.read_csv(file_path)
|
||
def perform_edger_analysis(counts_df, condition1_samples, condition2_samples, min_count=1,
                           *, log2fc_threshold=1.0, alpha=0.05):
    """
    Perform a two-group differential expression analysis using edgeR.

    The count table is validated and filtered in pandas, then handed to edgeR
    (through rpy2) for normalization-factor calculation, common and tagwise
    dispersion estimation and the exact test. P-values are adjusted with the
    Benjamini-Hochberg method via R's ``p.adjust``.

    Args:
        counts_df (pandas.DataFrame): Count data with a 'Gene ID' column and
            one column per sample.
        condition1_samples (list): Sample column names for condition 1
            (the baseline of the comparison).
        condition2_samples (list): Sample column names for condition 2.
        min_count (int): A gene is kept only when its raw count summed over
            the selected samples is strictly greater than this value.
        log2fc_threshold (float): Minimum absolute log2 fold change for a gene
            to be labelled 'Significant'.
        alpha (float): Adjusted p-value cut-off for a gene to be labelled
            'Significant'.

    Returns:
        pandas.DataFrame: One row per retained gene with the columns
        'Gene ID', 'log2FC', 'P.Value', 'Adj.P.Value' and 'DE.Status'.
        'log2FC' is condition 2 relative to condition 1.

    Raises:
        KeyError: If 'Gene ID' or any requested sample column is missing.
        ValueError: If either condition has no samples, or if no gene passes
            the ``min_count`` filter.
    """
    group1 = list(condition1_samples)
    group2 = list(condition2_samples)
    if not group1 or not group2:
        raise ValueError("Each condition needs at least one sample name")

    selected = group1 + group2
    absent = [col for col in ['Gene ID'] + selected if col not in counts_df.columns]
    if absent:
        raise KeyError(f"Columns missing from count data: {absent}")

    # Genes as the index, samples ordered condition 1 first, then condition 2.
    count_matrix = counts_df.set_index('Gene ID')[selected]

    # Filter on the pandas side so the same mask defines both the matrix sent
    # to R and the gene labels attached to the results. A plain boolean array
    # is used so duplicated gene IDs cannot break index alignment.
    expressed = (count_matrix.sum(axis=1) > min_count).to_numpy()
    count_matrix = count_matrix.loc[expressed]
    if count_matrix.empty:
        raise ValueError(
            f"No genes have a total count greater than min_count={min_count}"
        )

    # Imported here so the validation above does not depend on the R bridge.
    from rpy2.robjects.conversion import localconverter

    edger = importr('edgeR')
    stats = importr('stats')
    base = importr('base')

    # Convert only at the Python/R boundary. Global activation of the pandas
    # converter would also convert the edgeR result objects, which must stay
    # R objects so they can be passed on to the next edgeR call.
    frame_converter = ro.default_converter + pandas2ri.converter
    with localconverter(frame_converter):
        r_frame = frame_converter.py2rpy(count_matrix)
    r_counts = base.as_matrix(r_frame)

    labels = ['group1'] * len(group1) + ['group2'] * len(group2)
    group = ro.FactorVector(labels)
    dge = edger.DGEList(counts=r_counts, group=group)

    dge = edger.calcNormFactors(dge)
    dge = edger.estimateCommonDisp(dge)
    dge = edger.estimateTagwiseDisp(dge)

    # Name the pair explicitly so the direction of the fold change does not
    # depend on factor level ordering: group1 is the baseline.
    et = edger.exactTest(dge, pair=ro.StrVector(['group1', 'group2']))

    r_table = base.as_data_frame(et.rx2('table'))
    with localconverter(frame_converter):
        results = frame_converter.rpy2py(r_table)

    # edgeR reports genes in input order, so the filtered index lines up.
    results.index = count_matrix.index
    results = results.reset_index()

    adjusted = stats.p_adjust(ro.FloatVector(results['PValue'].tolist()), method='BH')
    results['adj.P.Val'] = list(adjusted)

    is_significant = (
        (results['logFC'].abs() >= log2fc_threshold)
        & (results['adj.P.Val'] < alpha)
    )
    results['Status'] = 'Not Significant'
    results.loc[is_significant, 'Status'] = 'Significant'

    # Rename by name rather than by position so an extra or reordered column
    # in the edgeR table cannot silently mislabel the output.
    results = results.rename(columns={
        'logFC': 'log2FC',
        'PValue': 'P.Value',
        'adj.P.Val': 'Adj.P.Value',
        'Status': 'DE.Status',
    })

    return results[['Gene ID', 'log2FC', 'P.Value', 'Adj.P.Value', 'DE.Status']]
|
||
def main(input_file="counts.csv",
         output_file="differential_expression_results.csv",
         condition1_samples=("Sample1", "Sample2", "Sample3"),
         condition2_samples=("Sample4", "Sample5", "Sample6")):
    """
    Run the differential expression analysis end to end.

    Loads the count table, runs the edgeR analysis for the two sample groups
    and writes the result table as CSV. Progress messages go to stdout; a
    failure is reported on stderr.

    Args:
        input_file (str): Path of the CSV file with the raw counts.
        output_file (str): Path the result CSV is written to.
        condition1_samples (sequence of str): Sample columns for condition 1.
        condition2_samples (sequence of str): Sample columns for condition 2.

    Returns:
        int: 0 when the analysis completed, 1 when any step failed.
    """
    import sys

    try:
        # Load count data
        print("Loading count data...")
        counts_df = load_count_data(input_file)

        # Perform differential expression analysis
        print("Performing differential expression analysis...")
        results = perform_edger_analysis(
            counts_df, list(condition1_samples), list(condition2_samples)
        )

        # Save results
        print(f"Saving results to {output_file}...")
        results.to_csv(output_file, index=False)
    except Exception as e:
        # Top-level boundary: report the failure and signal it through the
        # return value so the process does not exit with status 0.
        print(f"Error: {e}", file=sys.stderr)
        return 1

    print("Analysis complete!")
    return 0


if __name__ == "__main__":
    # Propagate main()'s status as the process exit code.
    raise SystemExit(main())