diff --git a/bin/homolog_genewise b/bin/homolog_genewise index d82522a..fe99d75 100755 --- a/bin/homolog_genewise +++ b/bin/homolog_genewise @@ -149,7 +149,7 @@ unless (-e "2.blast.ok") { if (($segmentSize - $_[9] + 1) < 1000) { $keep = 0; } $_[8] += $pos; - $_[9] += + $pos; + $_[9] += $pos; print OUT (join "\t", @_); } close OUT; diff --git a/bin/homolog_predict.pl b/bin/homolog_predict.pl deleted file mode 100755 index 406513b..0000000 --- a/bin/homolog_predict.pl +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use Getopt::Long; -use Cwd qw/abs_path getcwd cwd/; -use File::Basename; - -my $usage = < out.gff - - --cpu default: 8 - 设置diamond程序使用线程数,genewise或gth命令运行的并行数。 - - --coverage_ratio default: 0.4 - 对diamond blastx的结果进行过滤,要求对同源蛋白的覆盖率不小于该值。 - - --evalue default: 1e-9 - 对diamond blastx的结果进行过滤,要求对同源蛋白的evalue值不大于该值。 - - --method default: gth - 设置使用同源蛋白进行基因预测的方法,可设置值为gth或genewise。一般情况下,genewise进行基因预测的耗时是gth的五倍。 - - --tmp_dir default: tmp_\$date\$pid - 程序运行时临时文件夹名称。 - - --help default: None - display this help and exit. - -USAGE -if (@ARGV==0){die $usage} - -my ($cpu, $coverage_ratio, $evalue, $method, $tmp_dir, $help_flag); -GetOptions( - "cpu:i" => \$cpu, - "coverage_ratio:f" => \$coverage_ratio, - "evalue:f" => \$evalue, - "method:s" => \$method, - "tmp_dir:s" => \$tmp_dir, - "help" => \$help_flag, -); -$cpu ||= 8; -$coverage_ratio ||= 0.4; -$evalue ||= 1e-9; -$method ||= "gth"; -unless ($method eq "gth" or $method eq "genewise") { - die "--method was set wrong, it should be set to gth or genewise. \n"; -} -my $date = `date +%Y%m%d%H%M%S`; chomp($date); -$tmp_dir ||= "tmp_$date$$"; -$tmp_dir = abs_path($tmp_dir); -mkdir $tmp_dir unless -e $tmp_dir; - -if ( $help_flag ) { die $usage } - -my $input_protein = abs_path($ARGV[0]); -my $input_genome = abs_path($ARGV[1]); - -my $pwd = `pwd`; print STDERR "##########\nPWD (Current Directory): $pwd"; -print STDERR (localtime) . "CMD (Main Program): $0 " . join(" ", @ARGV) . "\n##########\n\n"; - - -# 1. 运行diamond分析,寻找蛋白序列和参考基因组的匹配位点。 -print STDERR "1. align genome sequences against homolog proteins through diamond software.\n"; -# 1.1 运行 diamond makedb 命令,将蛋白序列做成数据库。 -print STDERR "1.1 run diamond makedb, get a protein database.\n"; -chdir $tmp_dir; -my $pwd = `pwd`; print STDERR "PWD: $pwd"; -my $cmdString = "diamond makedb --threads $cpu --db homolog --in $input_protein 2> 1.1.diamond_makedb.log"; -unless (-e "1.1.diamond_makedb.ok") { - print STDERR (localtime) . ": CMD: $cmdString\n"; - system("$cmdString") == 0 or die "failed to execute: $cmdString\n"; - open OUT, ">", "1.1.diamond_makedb.ok" or die $!; close OUT; -} -else { - print STDERR "CMD(Skipped): $cmdString\n"; -} - -# 1.2 运行 diamond blastx 命令将基因组序列比对到蛋白序列数据库上。注意 --max-target-seqs 参数设置为蛋白序列的数量。 -print STDERR "1.2 split genome sequences, and get diamond blastx commands.\n"; -unless ( -e "1.2.split_genome_seqs_and_get_cmds.ok" ) { - # 获取蛋白序列数量 - open IN, $input_protein or die "Can not open file $input_protein, $!"; - my $protein_number = 0; - while () { - $protein_number ++ if m/^>/; - } - close IN; - # 将基因组序列分割成单独的序列,得到diamond blastx命令 - open IN, $input_genome or die "Can not open file $input_genome, !"; - my (%seq, $seq_ID); - while () { - if (m/^>(\S+)/) { - $seq_ID = $1; - } - else { - chomp; - $seq{$seq_ID} .= $_; - } - } - close IN; - mkdir "split_genome_seqs" unless -e "split_genome_seqs"; - open CMD1, ">", "split_genome_seqs/command.diamond_blastx.list" or die "Can not create file split_genome_seq/command.diamond_blastx.list, $!"; - open CMD2, ">", "split_genome_seqs/command.parsing_blast_result.list" or die "Can not create file split_genome_seq/command.parsing_blast_result.list, $!"; - foreach ( keys %seq ) { - open OUT, ">", "split_genome_seqs/$_.fa" or die "Can not create file split_genome_seq/$_.fa, $!"; - print OUT ">$_\n$seq{$_}\n"; - close OUT; - print CMD1 "diamond blastx --db homolog --query split_genome_seqs/$_.fa --out split_genome_seqs/$_.xml --outfmt 5 --sensitive --max-target-seqs $protein_number --evalue 1e-3 --id 10 2> split_genome_seqs/$_.log\n"; - print CMD2 "parsing_blast_result.pl --type xml --no-header --max-hit-num $protein_number --evalue $evalue --subject-coverage $coverage_ratio --query-coverage 0 split_genome_seqs/$_.xml > split_genome_seqs/$_.tab\n"; - } - close CMD1; close CMD2; - open OUT, ">", "1.2.split_genome_seqs_and_get_cmds.ok" or die $!; close OUT; -} -else { - print STDERR "This step was skipped, for the file 1.2.split_genome_seqs_and_get_cmds.ok exists.\n"; -} - -# 1.3 并行化运行 diamond blastx 命令 -# 按内存余量确定 diamond 的并行化数量。按单个 dianmond 命令峰值消耗 60 Gb 内存。 -my $MemAvailable = &get_MemAvailable(); -my $paraFly_CPU = 1; -$paraFly_CPU = $cpu / 4 if $paraFly_CPU < ($cpu / 4); -$paraFly_CPU = $MemAvailable / 60000000 if $paraFly_CPU > ($MemAvailable / 60000000); -$paraFly_CPU = 1 if $paraFly_CPU < 1; -my $cmdString = "ParaFly -c split_genome_seqs/command.diamond_blastx.list -CPU $paraFly_CPU 2> 1.3.ParaFly_diamond_blastx.log"; -unless ( -e "1.3.ParaFly_diamond_blastx.ok" ) { - print STDERR (localtime) . ": CMD: $cmdString\n"; - if ( system("$cmdString") != 0 ) { - $cmdString = "ParaFly -c split_genome_seqs/command.diamond_blastx.list -CPU 1 2>> 1.3.ParaFly_diamond_blastx.log"; - print STDERR (localtime) . ": CMD: $cmdString\n"; - system("$cmdString") == 0 or die "failed to execute: $cmdString\n"; - } - open OUT, ">", "1.3.ParaFly_diamond_blastx.ok" or die $!; close OUT; -} -else { - print STDERR "CMD(Skipped): $cmdString\n"; -} - -# 1.4 解析 xml 文件,得到基因组区域及其匹配的氨基酸序列名称。 -$cmdString = "ParaFly -c split_genome_seqs/command.parsing_blast_result.list -CPU $cpu 2> 1.4.ParaFly_parsing_blast_result.log"; -unless ( -e "1.4.ParaFly_parsing_blast_result.ok" ) { - print STDERR (localtime) . ": CMD: $cmdString\n"; - if ( system("$cmdString") != 0 ) { - $cmdString = "ParaFly -c split_genome_seqs/command.parsing_blast_result.list -CPU 1 2> 1.4.ParaFly_parsing_blast_result.log"; - print STDERR (localtime) . ": CMD: $cmdString\n"; - system("$cmdString") == 0 or die "failed to execute: $cmdString\n"; - } - open OUT, ">", "1.4.ParaFly_parsing_blast_result.ok" or die $!; close OUT; -} -else { - print STDERR "CMD(Skipped): $cmdString\n"; -} - -# 2. 解析diamond比对结果, -# 2.1 根据同源蛋白的比对结果,推测出物种的基因长度信息。 -print STDERR "2 - -sub get_MemAvailable { - open IN, "/proc/meminfo" or die "Can not open file /proc/meminfo, $!"; - my $MemAvailable; - while () { - if (m/MemAvailable:\s*(\d+)\s*kB/) { - $MemAvailable = $1; - next; - } - } - close IN; - return $MemAvailable; -} diff --git a/bin/homolog_prediction b/bin/homolog_prediction new file mode 100755 index 0000000..c2becb7 --- /dev/null +++ b/bin/homolog_prediction @@ -0,0 +1,104 @@ +#!/usr/bin/env perl + +use strict; +use Getopt::Long; +use Cwd qw/abs_path getcwd cwd/; +use File::Basename; + +my $binPath = dirname($0); + +my $usage = < out.gff + + --out_prefix default: out + 设置输出文件前缀。程序默认输出如下文件:(1)out.diamond.tab,将基因组序列和同源蛋白进行dimond blastx比对的结果文件;(2)out. + + --segmentSize default: 1000000 + --overlapSize default: 100000 + 程序将基因组较长的序列进行分割,以加快diamond blastx的并行化比对速度;若单条序列长度超过1Mb, 则将单条序列进行切割,两条相邻的序列间重叠的长度为100kb。 + + --cpu default: 8 + 设置diamond程序使用线程数,genewise或gth命令运行的并行数。 + + --identity default: 0.2 + --evalue default: 1e-9 + 设置diamond blastx分析时的Identity和E-value阈值。由于diamond blastx进行分析,每个Hit仅给出一个最优的HSP(而NCBI Blast+会给出所有HSPs),因此不需要根据覆盖率来对BLASTX结果进行过滤。 + + --method default: gth + 设置使用同源蛋白进行基因预测的方法,可设置值为gth或genewise。一般情况下,genewise进行基因预测的耗时是gth的五倍。 + + --tmp_dir default: tmp_\$date\$pid + 程序运行时临时文件夹名称。 + + --help default: None + display this help and exit. + +USAGE +if (@ARGV==0){die $usage} + +my ($out_prefix, $segmentSize, $overlapSize, $cpu, $identity, $evalue, $method, $tmp_dir, $help_flag); +GetOptions( + "out_prefix:s" => \$out_prefix, + "segmentSize:i" => \$segmentSize, + "overlapSize:i" => \$overlapSize, + "cpu:i" => \$cpu, + "identity:f" => \$identity, + "evalue:f" => \$evalue, + "method:s" => \$method, + "tmp_dir:s" => \$tmp_dir, + "help" => \$help_flag, +); +$out_prefix ||= "out"; +$segmentSize ||= 1000000; +$overlapSize ||= 100000; +$cpu ||= 8; +$identity ||= 0.2; +$evalue ||= 1e-9; +$method ||= "gth"; +unless ($method eq "gth" or $method eq "genewise") { + die "--method was set wrong, it should be set to gth or genewise. \n"; +} +my $date = `date +%Y%m%d%H%M%S`; chomp($date); +$tmp_dir ||= "tmp_$date$$"; +$tmp_dir = abs_path($tmp_dir); +mkdir $tmp_dir unless -e $tmp_dir; + +if ( $help_flag ) { die $usage } + +my $input_protein = abs_path($ARGV[0]); +my $input_genome = abs_path($ARGV[1]); + +my $pwd = `pwd`; print STDERR "##########\nPWD (Current Directory): $pwd"; +print STDERR (localtime) . "CMD (Main Program): $0 " . join(" ", @ARGV) . "\n##########\n\n"; + + +# 1. 运行diamond分析,寻找蛋白序列和参考基因组的匹配位点。 +print STDERR "1. align genome sequences against homolog proteins through diamond software.\n"; +chdir $tmp_dir; +my $pwd = `pwd`; print STDERR "PWD: $pwd"; +my $cmdString = "$binPath/homolog_prediction.01paraDiamond --segmentSize $segmentSize --overlapSize $overlapSize --cpu $cpu --identity $identity --evalue $evalue --tmp_dir a.para_diamond $input_protein $input_genome > $out_prefix.diamond.tab 2> a.para_diamond.log"; +unless (-e "1.paraDiamond.ok") { + print STDERR (localtime) . ": CMD: $cmdString\n"; + system("$cmdString") == 0 or die "failed to execute: $cmdString\n"; + open OUT, ">", "1.paraDiamond.ok" or die $!; close OUT; +} +else { + print STDERR "CMD(Skipped): $cmdString\n"; +} + +# 2. 过滤被包含匹配区域中得分过低的比对结果。 +print STDERR "2. filtering the low quality alignments when contained in another match region.\n"; +my $cmdString = "$binPath/homolog_prediction.02filtringLowQualityHits "; +unless (-e "2.filtringLowQualityHits.ok") { + print STDERR (localtime) . ": CMD: $cmdString\n"; + system("$cmdString") == 0 or die "failed to execute: $cmdString\n"; + open OUT, ">", "2.filtringLowQualityHits.ok" or die $!; close OUT; +} +else { + print STDERR "CMD(Skipped): $cmdString\n"; +} + +# 3. 调用genewise(wise)进行基因预测。 +# 4. 调用gth(genomethreader)进行基因预测。 +# 5. 合并所有的基因预测结果,若一个区域有多个蛋白预测的结果,取其一致性结果。 diff --git a/bin/homolog_prediction.01paraDiamond b/bin/homolog_prediction.01paraDiamond new file mode 100755 index 0000000..339f3fb --- /dev/null +++ b/bin/homolog_prediction.01paraDiamond @@ -0,0 +1,195 @@ +#!/usr/bin/env perl + +use strict; +use Getopt::Long; +use Cwd qw/abs_path getcwd cwd/; +use File::Basename; + +my $usage = < out.tab + + 本程序用于将全基因组序列和临近物种的同源蛋白使用diamond blastx进行比对,得到表格结果。若使用dianmond命令直接对单个全基因组Fasta文件进行分析,单个diamond命令并不能一直占用单台服务器全部CPU线程,有很长时间的单线程运行阶段,从而导致程序运行速度较慢。因此,推荐使用本程序将基因组序列进行分割后并行化计算,再合并结果,从而加快计算速度。 + + --segmentSize default: 1000000 + --overlapSize default: 100000 + 程序将基因组较长的序列进行分割,以加快diamond blastx的并行化比对速度;若单条序列长度超过1Mb, 则将单条序列进行切割,两条相邻的序列间重叠的长度为100kb。 + + --cpu default: 8 + 设置CPU线程数。本程序调用diamond命令时未设置--threads参数,表明单个diamond默认使用所有CPU线程;但diamond程序后半部分往往对CPU线程利用较少,往往只能使用4个CPU线程,因此本程序并行化运行diamond的数量为--cpu参数设置的值除以4;此外,考虑到diamond比较消耗内存,单个diamond程序消耗内存峰值10G,并行化运行diamond的数量最高为剩余内存量除以10G;当调用ParaFly并行化运行失败后,程序会使用1个并行数来运行diamond命令。 + + --identity default: 0.2 + --evalue default: 1e-9 + 设置diamond blastx分析时的Identity和E-value阈值。由于diamond blastx进行分析,每个Hit仅给出一个最优的HSP(而NCBI Blast+会给出所有HSPs),因此不需要根据覆盖率来对BLASTX结果进行过滤。 + + --tmp_dir default: tmp_\$date\$pid + 程序运行时临时文件夹名称。 + + --help default: None + display this help and exit. + +USAGE +if (@ARGV==0){die $usage} + +my ($segmentSize, $overlapSize, $cpu, $identity, $evalue, $tmp_dir, $help_flag); +GetOptions( + "segmentSize:i" => \$segmentSize, + "overlapSize:i" => \$overlapSize, + "cpu:i" => \$cpu, + "identity:f" => \$identity, + "evalue:f" => \$evalue, + "tmp_dir:s" => \$tmp_dir, + "help" => \$help_flag, +); +$segmentSize ||= 1000000; +$overlapSize ||= 100000; +$cpu ||= 8; +$identity ||= 0.2; +$evalue ||= 1e-9; +my $date = `date +%Y%m%d%H%M%S`; chomp($date); +$tmp_dir ||= "tmp_$date$$"; +$tmp_dir = abs_path($tmp_dir); +mkdir $tmp_dir unless -e $tmp_dir; + +if ( $help_flag ) { die $usage } + +my $input_protein = abs_path($ARGV[0]); +my $input_genome = abs_path($ARGV[1]); + +my $pwd = `pwd`; print STDERR "##########\nPWD (Current Directory): $pwd"; +print STDERR (localtime) . "CMD (Main Program): $0 " . join(" ", @ARGV) . "\n##########\n\n"; + + +# 1 运行 diamond makedb 命令,将蛋白序列做成数据库。 +print STDERR "1. run diamond makedb, get a protein database.\n"; +chdir $tmp_dir; +my $pwd = `pwd`; print STDERR "PWD: $pwd"; +my $cmdString = "diamond makedb --threads $cpu --db homolog --in $input_protein 2> 1.diamond_makedb.log"; +unless (-e "1.diamond_makedb.ok") { + print STDERR (localtime) . ": CMD: $cmdString\n"; + system("$cmdString") == 0 or die "failed to execute: $cmdString\n"; + open OUT, ">", "1.diamond_makedb.ok" or die $!; close OUT; +} +else { + print STDERR "CMD(Skipped): $cmdString\n"; +} + +# 2 运行 diamond blastx 命令将基因组序列比对到蛋白序列数据库上。注意 --max-target-seqs 参数设置为蛋白序列的数量。 +print STDERR "2. split genome sequences, and get diamond blastx commands.\n"; +unless ( -e "2.split_genome_seqs_and_get_cmds.ok" ) { + # 获取蛋白序列数量 + open IN, $input_protein or die "Can not open file $input_protein, $!"; + my $protein_number = 0; + while () { + $protein_number ++ if m/^>/; + } + close IN; + # 将基因组序列按长度进行分割,得到diamond blastx命令 + open IN, $input_genome or die "Can not open file $input_genome, !"; + my (%seq, $seq_ID); + while () { + chomp; + if (m/^>(\S+)/) { $seq_ID = $1; } + else { $seq{$seq_ID} .= $_; } + } + close IN; + mkdir "split_genome_seqs" unless -e "split_genome_seqs"; + open CMD, ">", "split_genome_seqs/command.diamond_blastx.list" or die "Can not create file split_genome_seq/command.diamond_blastx.list, $!"; + my @seq_ID; + foreach my $seq_id ( sort keys %seq ) { + my $seq = $seq{$seq_id}; + my $seq_length = length $seq; + if ($seq_length > $segmentSize) { + my @partion = &get_partition($seq_length, $segmentSize, $overlapSize); + foreach (@partion) { + my $start = $_ - 1; + my $end = $start + $segmentSize; + my $sub_seq = substr($seq, $start, $segmentSize); + open OUT, ">", "split_genome_seqs/$seq_id.$start.fa" or die "Can not create file split_genome_seqs/$seq_id.$start.fa, $!\n"; + print OUT ">$seq_id\_start_$start\n$sub_seq\n"; + close OUT; + push @seq_ID, "$seq_id.$start"; + } + } + else { + open OUT, ">", "split_genome_seqs/$seq_id.0.fa" or die "Can not create file split_genome_seqs/$seq_id.0.fa, $!\n"; + print OUT ">$seq_id\_start_0\n$seq\n"; + close OUT; + push @seq_ID, "$seq_id.0"; + } + } + foreach ( @seq_ID ) { + print CMD "diamond blastx --db homolog --query split_genome_seqs/$_.fa --out split_genome_seqs/$_.out --outfmt 6 --sensitive --max-target-seqs $protein_number --evalue $evalue --id $identity 2> split_genome_seqs/$_.log\n"; + } + close CMD; + open OUT, ">", "2.split_genome_seqs_and_get_cmds.ok" or die $!; close OUT; + my $sequence_number = @seq_ID; + print STDERR "The genome was split to $sequence_number sequences.\n"; +} +else { + print STDERR "This step was skipped, for the file 2.split_genome_seqs_and_get_cmds.ok exists.\n"; +} + +# 3 并行化运行 diamond blastx 命令 +# 按内存余量确定 diamond 的并行化数量。按单个 dianmond 命令峰值消耗 10 Gb 内存。 +print STDERR "3. run diamond blastx.\n"; +my $MemAvailable = &get_MemAvailable(); +my $paraFly_CPU = 1; +$paraFly_CPU = $cpu / 4 if $paraFly_CPU < ($cpu / 4); +$paraFly_CPU = $MemAvailable / 10000000 if $paraFly_CPU > ($MemAvailable / 10000000); +$paraFly_CPU = 1 if $paraFly_CPU < 1; +$paraFly_CPU = int($paraFly_CPU + 0.5); +my $cmdString = "ParaFly -c split_genome_seqs/command.diamond_blastx.list -CPU $paraFly_CPU 2> 3.ParaFly_diamond_blastx.log"; +unless ( -e "1.3.ParaFly_diamond_blastx.ok" ) { + print STDERR (localtime) . ": CMD: $cmdString\n"; + if ( system("$cmdString") != 0 ) { + $cmdString = "ParaFly -c split_genome_seqs/command.diamond_blastx.list -CPU 1 2>> 3.ParaFly_diamond_blastx.log"; + print STDERR (localtime) . ": CMD: $cmdString\n"; + system("$cmdString") == 0 or die "failed to execute: $cmdString\n"; + } + open OUT, ">", "3.ParaFly_diamond_blastx.ok" or die $!; close OUT; +} +else { + print STDERR "CMD(Skipped): $cmdString\n"; +} + +# 4 汇总dimond的比对结果,并修正匹配的位置信息(之前打断基因组序列后,比对的位置需要进行修正)。 +print STDERR "4. output the alignment results.\n"; +foreach (<"split_genome_seqs/*.out">) { + open IN, $_ or die "Can not open file $_, $!"; + while () { + @_ = split /\t/; + my $pos = 0; + $pos = $1 if $_[0] =~ s/_start_(\d+)//; + $_[6] += $pos; + $_[7] += $pos; + print (join "\t", @_); + } + close IN; +} + +sub get_MemAvailable { + open IN, "/proc/meminfo" or die "Can not open file /proc/meminfo, $!"; + my $MemAvailable; + while () { + if (m/MemAvailable:\s*(\d+)\s*kB/) { + $MemAvailable = $1; + next; + } + } + close IN; + return $MemAvailable; +} + +sub get_partition { + my ($length, $ss, $os) = @_; + my @out; + + my $pos = 1; + push @out, $pos; + while (($pos + $ss - 1) < $length) { + $pos = $pos + $ss - 1 - $os + 1; + push @out, $pos; + } + return @out; +}