-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun.py
105 lines (65 loc) · 10.6 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# imports
import os
import argparse
import pandas as pd
'''
This script essentially orchestrates all the other scripts required to perform the differential gene expression analysis. It is required to be run from the home directory of this project.
'''
# the paths below written into the code are not hardcoded but rather the ones that reflect the data folders created by the first bash script that runs (just as a reference in case it looks misleading)
# command line arguments defined
parser = argparse.ArgumentParser(description='differential expression pipeline') # defining the argument parser
parser.add_argument('-s', '--input', help='input file with NCBI links', required=False, default='testData/links/fileLinks.txt') # this is the input file with NCBI links if you want to download fastq files from scratch
parser.add_argument('-i', '--index', help='input accession id for index', required=False, default='NC_006273.2') # this is the id for the reference database which defaults to the one in the problem but can be substituted
parser.add_argument('-e', '--email', help='input email for NCBI access', required=True) # this is required and is the email for the ability to use Entrez
parser.add_argument('-m', '--metatable', help='metatable tab deliminated', required=False, default='./testData/metatable.tsv') # this is the metadata table that is not required but can be substituted for the default one
parser.add_argument('-l', '--logfile', help='name/path of log file', required=False, default='./PipelineProject_Rohan_Sethi/PipelineProject.log') # this is the name/path of where the log file will be created defaults into the folder
parser.add_argument('-n', '--name', help='name of reference to blast against', required=False, default='Betaherpesvirinae') # this is the name of the species that the sequences will be blasted against; defaults to the one on the problem but can be substituted
parser.add_argument('-b', '--blastdb', help='if already have blast genome fasta file, then input path to it here', required=False, default=None)
parser.add_argument('-u', '--numSelect', help='number of blast results to store', required=False, default=10) # this is the argument to determine the number of significant blast results to store in the final csv file; defaults to 10 like in the problem
parser.add_argument('-t', '--testData', help='input test data folder name', required=False, default=None) # this is the folder of the testdata to run the test; need to input this
if __name__ == '__main__': # this will run everything below only if run from the command line and not imported as a package
args = parser.parse_args() # creating an instance of arguments
if args.testData == None: # if no user input of the testData folder, then runs from beginning; not the test run that is in the else conditional
os.system('chmod +x ./scripts/data.sh') # this is making the bash script executable from the command lined
os.system(f'./scripts/data.sh {args.logfile} {args.input}') # this is a bash file that will create all the necessary output folders and download the sequences from the links provided
os.system('python3 ./scripts/decompressFastQ.py -i ./PipelineProject_Rohan_Sethi/data/raw -o ./PipelineProject_Rohan_Sethi/data/fastq') # this is a python script that will run fastqdump on all the SRA files downladed
os.system(f'python3 ./scripts/retreiveIndex.py -e {args.email} -a {args.index} -o ./PipelineProject_Rohan_Sethi/data/index/raw.fasta -p ./PipelineProject_Rohan_Sethi/data/index/protein.fasta -l {args.logfile}') # this will retreive the fasta files for the reference genome (protein and nucleotides because will use the most significantly expressed protein sequence for blast in the last step)
os.system(f'time kallisto index -i ./PipelineProject_Rohan_Sethi/data/index/index.idx ./PipelineProject_Rohan_Sethi/data/index/raw.fasta') # this will created the index out of the nucleotide sequences using kallisto
meta = pd.read_table(args.metatable) # this opens the metatable
names = list(meta['sample']) # this captures all the SRA file names
for fn in names: # for every SRA file
f1 = f'./PipelineProject_Rohan_Sethi/data/fastq/{fn}_1.fastq' # save the path of the forward read
f2 = f'./PipelineProject_Rohan_Sethi/data/fastq/{fn}_2.fastq' # save the path of the reverse read
os.system(f'time kallisto quant -i ./PipelineProject_Rohan_Sethi/data/index/index.idx -o ./PipelineProject_Rohan_Sethi/results/{fn} -b 30 -t 4 {f1} {f2}') # quantify using kallisto the transcripts per million to determine the abundance of each gene expressed; the gene is determined by aligning to the index and the number of alignments = the transcripts which is normalized to get TPMs
os.system(f'python3 ./scripts/quantify.py -m {args.metatable} -r ./PipelineProject_Rohan_Sethi/results -l {args.logfile}') # here I save the summary of the TPMS per sample and store it in the logfile
os.system(f'Rscript ./scripts/diffExpAnalysis.R {args.metatable} {args.logfile} ./PipelineProject_Rohan_Sethi/results/sigDiffExp.tsv') # here I run an R script I wrote to find the most significantly differentially expressed TPMS between samples; the ids are stored in a file that will be accessed by other scripts
os.system(f'python3 ./scripts/sigExpSpecies.py -i ./PipelineProject_Rohan_Sethi/data/index/protein.fasta -s ./PipelineProject_Rohan_Sethi/results/sigDiffExp.tsv -o ./PipelineProject_Rohan_Sethi/data/blast/mostDifferentiallyExpressed.fasta') # here I find the single most signficant differentially expressed gene from the step above and I store its protein sequence by extracting it from the protein fasta I made of the reference using the protein id
os.system('rm ./PipelineProject_Rohan_Sethi/results/sigDiffExp.tsv') # here I remove an intermediate file that is not necessary anymore
if args.blastdb == None: # this step takes a long time depending on how many sequences there are from the search so can also optionally input the path to a previously downloaded fasta to disable this step; if doing so make sure that the -n flag is appropriately updated
os.system(f'python3 ./scripts/blastReference.py -n {args.name} -e {args.email} -o ./PipelineProject_Rohan_Sethi/data/blast/{args.name}.fasta') # here I extract the genome files of the Betaherpesvirinae from NCBI; this can be changed to a different one but by default uses Betaherpesvirinae
else:
os.system(f'cp {args.blastdb} ./PipelineProject_Rohan_Sethi/data/blast') # this will copy the fasta file to the blast directory which will later be invoked for database compilation
os.system('chmod +x ./scripts/blast.sh') # here I make this shell script executable
os.system(f'./scripts/blast.sh {args.name} ./PipelineProject_Rohan_Sethi/data/blast/{args.name}.fasta ./PipelineProject_Rohan_Sethi/data/blast/mostDifferentiallyExpressed.fasta ./PipelineProject_Rohan_Sethi/data/blast/{args.name} ./PipelineProject_Rohan_Sethi/results/blastResults.csv') # here I run the blast tblastn, blasting the most significant protein against the Betavirinae genome sequences
os.system(f'python3 ./scripts/selectN.py -i ./PipelineProject_Rohan_Sethi/results/blastResults.csv -n {args.numSelect} -l {args.logfile}') # here I select the top 10 matches from the blast search of Betavirinae strains found for the protein in the blast serach; only the top 10 most significant mathces (HPSs) are selected from as the tblastn command in the blast.sh and testBlast.sh file deal with this using the flag max_hps=1
else: # this will run if using the testData for the test run; it uses the same scripts with some exceptions so the comments are the same as above and mentioned as such
os.system('chmod +x ./scripts/testData.sh') # I wrote a separate shell file that doesn't require downloading fastq files since samples will be provided; this script will create the necessary folders for the testrun and extract the data needed from the testData folder provided
os.system(f'./scripts/testData.sh {args.logfile} {args.testData}') # this will run the script explained above which is made executable above
os.system(f'python3 ./scripts/retreiveIndex.py -e {args.email} -a {args.index} -o ./PipelineProject_Rohan_Sethi/data/index/raw.fasta -p ./PipelineProject_Rohan_Sethi/data/index/protein.fasta -l {args.logfile}')# here we retreive the index like above
os.system(f'time kallisto index -i ./PipelineProject_Rohan_Sethi/data/index/index.idx ./PipelineProject_Rohan_Sethi/data/index/raw.fasta') # here we create the index like above
# the steps below run kallisto quanitification for each of the samples like mentioned above
meta = pd.read_table(args.metatable)
names = list(meta['sample'])
for fn in names:
f1 = f'./PipelineProject_Rohan_Sethi/data/fastq/{fn}_1.fastq'
f2 = f'./PipelineProject_Rohan_Sethi/data/fastq/{fn}_2.fastq'
os.system(f'time kallisto quant -i ./PipelineProject_Rohan_Sethi/data/index/index.idx -o ./PipelineProject_Rohan_Sethi/results/{fn} -b 30 -t 4 {f1} {f2}')
# the three scripts below serve the same function they did above
os.system(f'python3 ./scripts/quantify.py -m ./PipelineProject_Rohan_Sethi/data/metatable.tsv -r ./PipelineProject_Rohan_Sethi/results -l {args.logfile}')
os.system(f'Rscript ./scripts/diffExpAnalysis.R ./PipelineProject_Rohan_Sethi/data/metatable.tsv {args.logfile} ./PipelineProject_Rohan_Sethi/results/sigDiffExp.tsv')
os.system(f'python3 ./scripts/sigExpSpecies.py -i ./PipelineProject_Rohan_Sethi/data/index/protein.fasta -s ./PipelineProject_Rohan_Sethi/results/sigDiffExp.tsv -o ./PipelineProject_Rohan_Sethi/data/blast/mostDifferentiallyExpressed.fasta') # here I find the most signficant differentially expressed gene from the step above and I store its protein sequence by extracting it from the protein fasta I made of the reference using the protein id
# the scripts below run the blast search using the predownloaded blast db of Betaherpesvirinae in the testData folder, which is why I use a slightly different script for the blasting of test data
os.system('chmod +x ./scripts/testBlast.sh')
os.system(f'./scripts/testBlast.sh ./PipelineProject_Rohan_Sethi/data/blast/mostDifferentiallyExpressed.fasta ./PipelineProject_Rohan_Sethi/data/blast/{args.name} ./PipelineProject_Rohan_Sethi/results/blastResults.csv')
# this serves the same purpose as it did above
os.system(f'python3 ./scripts/selectN.py -i ./PipelineProject_Rohan_Sethi/results/blastResults.csv -n {args.numSelect} -l {args.logfile}')