-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbuild.cmd
126 lines (96 loc) · 6.61 KB
/
build.cmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
@echo off
SET base=P:\huang
SET sp_name=Genus_species
REM Requirements
REM
REM 1. GCModeller X64
REM 2. CMD.Internals toolkits
REM Whitespace and non-ascii character and symbols should avoid in the path string value.
REM config the genome data source and the virtual cell model output path
SET genome="%base%\genome"
SET chromosome_gbk=%genome%\huang.gbk
SET chromosome_id=huang
SET model_output="%base%\bacterial.GCmarkup"
REM config KEGG repository data directory location at here
SET br08901_maps="P:\91001_GB\KGML\br08901"
SET KEGG_cpds="P:\91001_GB\KGML\KEGG_cpd"
SET br08201_network="P:\91001_GB\KGML\br08201"
SET maps="P:\91001_GB\KGML\Maps"
REM config myva COG database files location
SET myva="E:\GCModeller-repo\COGs\Myva\myva"
SET whog="E:\GCModeller-repo\COGs\Myva\whog.XML"
REM config for UniProt reference sequence database
SET uniprot="P:\91001_GB\KO\uniprot-bacteria.KO.faa"
REM config for RegPrecise database repository
SET regpreciseMotifs="P:\91001_GB\transcript_regulations\Regprecise.motifs.fasta"
SET regpreciseRegulators="P:\91001_GB\transcript_regulations\regulators\KEGG_genomes.fasta"
SET regprecise="P:\91001_GB\transcript_regulations\RegpreciseDownloads"
REM config for the component output directory
SET KO_base=%base%\KO
SET COG_base=%base%\COG
SET TF_base=%base%\transcript_regulations
mkdir %KO_base%
mkdir %COG_base%
mkdir %TF_base%
REM first of all
REM Extract sequence data and genome context files
foreach *.gbk in %genome% do localblast /Export.gb /gb $file /out %genome%
REM KO annotation base on SBH
mkdir "%KO_base%\blastp"
mkdir "%COG_base%\blastp"
REM blastp procedures
makeblastdb -in "%uniprot%" -dbtype prot
foreach dir in %genome% do makeblastdb -in "$file/%sp_name%.faa" -dbtype prot
foreach dir in %genome% do blastp -query "$file/%sp_name%.faa" -db %uniprot% -evalue 1e-5 -num_threads 8 -out "%KO_base%\blastp\$basename.txt"
REM and then export raw blastp output result using GCModeller CLI tools.
foreach *.txt in "%KO_base%\blastp" do localblast /SBH.Export.Large /in $file /out "%KO_base%\KO_align_sbh\$basename.csv" /s.pattern "tokens | first" /q.pattern "tokens | 4" /identities 0.15 /coverage 0.5 /top.best
foreach *.csv in "%KO_base%\KO_align_sbh" do eggHTS /proteins.KEGG.plot /in "$file" /field "hit_name" /geneId.field "query_name" /size 2600,2200 /tick -1 /out "%KO_base%\KO_profiles\$basename"
REM mark the gene hits on KEGG pathway maps
foreach dir in "%KO_base%\KO_profiles" do eggHTS /KEGG.Color.Pathway /in "$file/KOCatalogs.csv" /ref %maps% /out "$file/maps"
REM COG annotation base on SBH
foreach dir in %genome% do blastp -query "$file/%sp_name%.faa" -db %myva% -evalue 1e-5 -num_threads 8 -out "%COG_base%\blastp\$basename.txt"
foreach *.txt in "%COG_base%\blastp" do localblast /COG.myva /blastp $file /whog %whog% /top.best /grep "tokens | 4" /out "%COG_base%\profiles\$basename.csv"
foreach *.csv in "%COG_base%\profiles" do eggHTS /COG.profiling.plot /in "$file" /size 2000,1300 /out "%COG_base%\$basename.png"
REM motif predicts
foreach dir in %genome% do makeblastdb -in "$file/%sp_name%.fna" -dbtype nucl
mkdir "%TF_base%\motifs\blastn"
mkdir "%TF_base%\regulators\blastp"
REM blastn mappings
REM no evalue filter
foreach dir in %genome% do blastn -query %regpreciseMotifs% -db "$file/%sp_name%.fna" -word_size 5 -out "%TF_base%\motifs\blastn\$basename.txt" /@set "/parallel=8;/clr=false"
REM export blastn mapping result and do motif tree cluster for the predictions
foreach *.txt in "%TF_base%\motifs\blastn" do localblast /Export.blastnMaps /in $file /out "%TF_base%\motifs\mappings\$basename.csv"
foreach *.csv in "%TF_base%\motifs\mappings" do VirtualFootprint /scan.blastn.map.motifsite /in $file /hits.base 5 /out "%TF_base%\motifs\sites\$basename.csv"
foreach *.csv in "%TF_base%\motifs\sites" do VirtualFootprint /Site.match.genes /in $file /genome "%genome%\$basename\%sp_name%.gbff" /max.dist 250 /replicon "locus" /out "%TF_base%\motifs\contexts\$basename.csv" /skip.RNA
REM TF regulators predictions
makeblastdb -in %regpreciseRegulators% -dbtype prot
foreach dir in %genome% do blastp -query "$file/%sp_name%.faa" -db %regpreciseRegulators% -evalue 1e-5 -num_threads 7 -out "%TF_base%\regulators\blastp\$basename.txt"
REM sbh method
REM due to the reason of orthology can be inherits from multiple source
REM So in this step, no top best hits limitation
REM This will resulted multiple genome source regulation network that will be created in next step
foreach *.txt in "%TF_base%\regulators\blastp" do localblast /SBH.Export.Large /in $file /out "%TF_base%\regulators\sbh\$basename.csv" /s.pattern "tokens ' ' first" /q.pattern "tokens | 4" /identities 0.6 /coverage 0.8
REM regulator annotations
foreach *.csv in "%TF_base%\regulators\sbh" do regprecise /regulators.bbh /bbh $file /regprecise %regprecise% /sbh /allow.multiple /description %regpreciseRegulators% /out "%TF_base%\regulators\mappings\$basename.csv"
REM if want to union the regulator predicts data between different replicons
REM the using
REM Excel /rbind /in "%TF_base%\regulators\mappings\%chromosome_id%.csv" /out %TF_union%
REM build TF regulation network after we have create the motif site and TF predictions
foreach *.csv in "%TF_base%\regulators\mappings" do VirtualFootprint /regulation.footprints /regulator "$file" /footprint "%TF_base%\motifs\contexts\$basename.csv" /out "%TF_base%\result_networks\$basename.csv"
REM TF from chromosome regulates genes from plasmids
foreach *.csv in "%TF_base%\motifs\contexts" do VirtualFootprint /regulation.footprints /regulator "%TF_base%\regulators\mappings\%chromosome_id%.csv" /footprint "$file" /out "%TF_base%\result_networks\$basename.csv"
REM chromosome map plot
mapplot /Config.Template /out "%genome%\plot\config.inf"
mapplot --Draw.ChromosomeMap.genbank /gb "%chromosome_gbk%" /motifs "%TF_base%\motifs\contexts\%chromosome_id%.csv" /hide.mics /conf "%genome%\plot\config.inf" /out "%genome%\plot" /COG "%COG_base%\profiles\%chromosome_id%.csv"
REM compile virtual cell data model
SET KO_union="P:\91001_GB\KO\NC_005810.KO.csv"
SET TF_union="P:\91001_GB\transcript_regulations\NC_005810.tf_regulations.csv"
SET GB_union="P:\91001_GB\NC_005810.genbank"
REM first of all, union the chromosome and plasmids genome data
ncbi_tools /gbff.union /in %genome% /out %GB_union%
REM and then, union all KEGG annotation data
Excel /rbind /in "%KO_base%\KO_profiles\*\KOCatalogs.csv" /out %KO_union%
REM at last, union all of the TF-regulation data
Excel /rbind /in "%TF_base%\result_networks" /out %TF_union%
REM finally, we are able to build this virtual cell model
GCC /compile.KEGG /in %GB_union% /KO %KO_union% /maps %br08901_maps% /compounds %KEGG_cpds% /reactions %br08201_network% /regulations %TF_union% /out %model_output%