Skip to content

Commit

Permalink
Create Python_RNASeq_amazonQ_result.txt
Browse files: browse the repository at this point in the history
  • Loading branch information
RamiyapriyaS authored Jan 22, 2025
1 parent eff2004 commit a34eac9
Showing 1 changed file with 123 additions and 0 deletions.
123 changes: 123 additions & 0 deletions docs/chatbot_comparison/code_dev/Python_RNASeq_amazonQ_result.txt
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,123 @@
#!/usr/bin/env python3

"""
Differential Expression Analysis using edgeR via rpy2
This script performs RNA-Seq differential expression analysis using the edgeR package.
"""

import pandas as pd
import numpy as np
from rpy2.robjects import pandas2ri, r
from rpy2.robjects.packages import importr
import rpy2.robjects as ro
import warnings

# Suppress warnings: called with only the "ignore" action (no category,
# module or message filter), this silences every warning raised in the
# process from this point on, not just those of one library.
warnings.filterwarnings("ignore")

def load_count_data(file_path):
    """
    Read an RNA-Seq count table from a CSV file.

    Args:
        file_path (str): Location of the CSV file holding the count data.

    Returns:
        pandas.DataFrame: The table parsed by ``pandas.read_csv`` with its
        default options. The file is expected to hold one gene per row and
        one sample per column; that layout is not checked here.
    """
    return pd.read_csv(file_path)

def _select_count_matrix(counts_df, condition1_samples, condition2_samples, min_count):
    """Validate the inputs and return the filtered gene-by-sample count matrix."""
    group1 = list(condition1_samples)
    group2 = list(condition2_samples)
    if not group1 or not group2:
        raise ValueError("Each condition needs at least one sample")

    if 'Gene ID' not in counts_df.columns:
        raise KeyError("Count data must contain a 'Gene ID' column")

    all_samples = group1 + group2
    missing = [name for name in all_samples if name not in counts_df.columns]
    if missing:
        raise KeyError(f"Samples not found in count data: {missing}")

    count_matrix = counts_df.set_index('Gene ID')[all_samples]

    # Filter low expression genes on the pandas side so the gene index that is
    # attached to the results later is guaranteed to match what edgeR tested.
    keep = count_matrix.sum(axis=1) > min_count
    count_matrix = count_matrix.loc[keep]
    if count_matrix.empty:
        raise ValueError(
            f"No genes have a total count greater than min_count={min_count}"
        )
    return count_matrix


def perform_edger_analysis(counts_df, condition1_samples, condition2_samples, min_count=1):
    """
    Perform differential expression analysis using edgeR.

    The selected samples are filtered, normalised with ``calcNormFactors``,
    dispersions are estimated (common, then tagwise) and the two groups are
    compared with edgeR's ``exactTest``. P-values are adjusted with the
    Benjamini-Hochberg method.

    Args:
        counts_df (pandas.DataFrame): Count data with a 'Gene ID' column and
            one column per sample.
        condition1_samples (list): Sample (column) names for condition 1.
        condition2_samples (list): Sample (column) names for condition 2.
        min_count (int): Genes whose count summed over the selected samples
            is not greater than this value are dropped before testing.

    Returns:
        pandas.DataFrame: One row per tested gene with the columns
        'Gene ID', 'log2FC', 'P.Value', 'Adj.P.Value' and 'DE.Status'.
        'DE.Status' is 'Significant' when |log2FC| >= 1 and the adjusted
        p-value is below 0.05, otherwise 'Not Significant'.

    Raises:
        KeyError: If 'Gene ID' or any requested sample column is missing.
        ValueError: If a condition has no samples or no gene passes the
            ``min_count`` filter.
    """
    # All input validation happens before R is touched, so bad arguments fail
    # fast with a Python-level message instead of an opaque R error.
    count_matrix = _select_count_matrix(
        counts_df, condition1_samples, condition2_samples, min_count
    )

    # Local import: only this function needs it. A scoped converter replaces
    # the global pandas2ri.activate(), which is deprecated in recent rpy2 and
    # can convert the edgeR return values away from R objects.
    from rpy2.robjects.conversion import localconverter

    # Import required R packages
    edger = importr('edgeR')
    stats = importr('stats')

    converter = ro.default_converter + pandas2ri.converter

    # Create DGEList object
    with localconverter(converter):
        r_counts = converter.py2rpy(count_matrix)
    group = ro.FactorVector(
        ['group1'] * len(condition1_samples) + ['group2'] * len(condition2_samples)
    )
    dge = edger.DGEList(counts=r_counts, group=group)

    # Calculate normalization factors
    dge = edger.calcNormFactors(dge)

    # Estimate dispersion
    dge = edger.estimateCommonDisp(dge)
    dge = edger.estimateTagwiseDisp(dge)

    # Perform exact test for differential expression
    et = edger.exactTest(dge)

    # Adjust p-values on the R side, then bring the table over to pandas
    table = et.rx2('table')
    adjusted = np.asarray(stats.p_adjust(table.rx2('PValue'), method='BH'))
    with localconverter(converter):
        results = converter.rpy2py(table)

    # No rows were dropped inside R, so rows align one-to-one with the
    # filtered count matrix; reuse its index to keep the original gene IDs.
    results.index = count_matrix.index
    results = results.reset_index()

    results['Adj.P.Value'] = adjusted

    # Add differential expression status
    significant = (results['logFC'].abs() >= 1) & (results['Adj.P.Value'] < 0.05)
    results['DE.Status'] = np.where(significant, 'Significant', 'Not Significant')

    # Rename by name rather than by position so a change in edgeR's column
    # order cannot silently mislabel the output.
    results = results.rename(columns={'logFC': 'log2FC', 'PValue': 'P.Value'})

    return results[['Gene ID', 'log2FC', 'P.Value', 'Adj.P.Value', 'DE.Status']]

def main():
    """
    Main function to run the example differential expression analysis.

    Loads ``counts.csv``, compares Sample1-Sample3 against Sample4-Sample6
    with ``perform_edger_analysis`` and writes the result table to
    ``differential_expression_results.csv``.

    Returns:
        int: 0 when the results were written, 1 when any step raised an
        exception (the error message is printed to stderr).
    """
    import sys

    # Example usage
    input_file = "counts.csv"
    output_file = "differential_expression_results.csv"

    # Define conditions
    condition1_samples = ["Sample1", "Sample2", "Sample3"]
    condition2_samples = ["Sample4", "Sample5", "Sample6"]

    try:
        # Load count data
        print("Loading count data...")
        counts_df = load_count_data(input_file)

        # Perform differential expression analysis
        print("Performing differential expression analysis...")
        results = perform_edger_analysis(counts_df, condition1_samples, condition2_samples)

        # Save results
        print(f"Saving results to {output_file}...")
        results.to_csv(output_file, index=False)
        print("Analysis complete!")
    except Exception as e:
        # Top-level boundary: report the failure and signal it through the
        # return value so the process does not exit with status 0.
        print(f"Error: {e}", file=sys.stderr)
        return 1

    return 0


if __name__ == "__main__":
    # Propagate main()'s result as the process exit status.
    raise SystemExit(main())

0 comments on commit a34eac9

Please sign in to comment.