Skip to content

Commit 512b0ef

Browse files
author
chenlianfu
committed
submit 20221122
1 parent e8c925d commit 512b0ef

File tree

3 files changed

+207
-4
lines changed

3 files changed

+207
-4
lines changed

bin/GFF3_extract_bestGeneModels

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
#!/usr/bin/env perl
2+
use strict;
3+
use Getopt::Long;
4+
use Cwd qw/abs_path getcwd cwd/;
5+
use File::Basename;
6+
7+
my $usage = <<USAGE;
8+
Usage:
9+
$0 [options] input.gff3 > bestGeneModels.gff3 2> AS_num_of_codingTranscripts.stats
10+
11+
本程序用于对输入的GFF3文件进行分析,若一个基因对应多个转录本,则输出最优的基因模型。
12+
程序的运行原理:
13+
(1)当一个基因有多个转录本时,首先剔除非编码转录本,仅保留mRNA类型转录本;
14+
(2)对多个mRNA类型转录本,对其CDS长度进行分析,找到最长的CDS长度,保留CDS长度达到指定阈值(最长CDS长度 * 80%)的mRNA转录本;
15+
(3)在mRNA信息第九列中寻找Integrity参数值(若未找到,统一设置为 complete)和Transcript_Ratio参数值(若未找到,则统一设置为 1),首先根据Integrity信息,按complete,5prime_partial,3prime_partial,internal排序;再根据Transcript_Ratio数值,按从大到小排序;再按CDS长度从长到短排序,最后按mRNA ID的ASCII码排序,选择排在第一位的mRNA作为最优的基因模型。
16+
(4)程序在标准错误输出中给出(可变剪接的)编码转录本数量及其对应的基因数量和ID信息。
17+
(4)此外,程序输出的基因顺序和输入文件的基因顺序一致。
18+
19+
--ratio_to_maximum_CDS_length <float> default: 0.8
20+
选择最优基因模型时,先保留CDS长度较长的多个转录本,要求这些转录本的长度至少达到最长CDS长度的一定比例。
21+
22+
--help default: None
23+
display this help and exit.
24+
25+
USAGE
26+
if (@ARGV==0) {die $usage}
27+
28+
my ($help_flag, $ratio_to_maximum_CDS_length);
29+
GetOptions(
30+
"help" => \$help_flag,
31+
"ratio_to_maximum_CDS_length:f" => \$ratio_to_maximum_CDS_length,
32+
);
33+
$ratio_to_maximum_CDS_length ||= 0.8;
34+
35+
if ( $help_flag ) { die $usage }
36+
37+
# 解析GFF3文件内容
38+
my %gene_info = &get_geneModels_from_GFF3($ARGV[0]);
39+
40+
# 获得GFF3文件中基因ID的顺序
41+
my (@geneID, %geneID);
42+
open IN, $ARGV[0] or die "Can not open file $ARGV[0], $!";
43+
while (<IN>) {
44+
next if /^\s*$/;
45+
next if /^#/;
46+
if (/\tgene\t.*ID=([^;]+)/) {
47+
push @geneID, $1 unless exists $geneID{$1};
48+
$geneID{$1} = 1;
49+
}
50+
}
51+
close IN;
52+
53+
# 对每个基因进行最优基因模型分析
54+
my %stats;
55+
foreach my $gene_ID ( @geneID ) {
56+
print $gene_info{$gene_ID}{"header"};
57+
58+
# 分析 mRNA 的CDS长度、完整性和转录本比例信息。
59+
my (%mRNA_info, @CDS_length);
60+
foreach my $mRNA_ID ( @{$gene_info{$gene_ID}{"mRNA_ID"}} ) {
61+
my $mRNA_header = $gene_info{$gene_ID}{"mRNA_header"}{$mRNA_ID};
62+
my @mRNA_header = split /\t/, $mRNA_header;
63+
next if $mRNA_header[2] ne "mRNA";
64+
my $mRNA_info = $gene_info{$gene_ID}{"mRNA_info"}{$mRNA_ID};
65+
66+
# 计算转录本的CDS信息
67+
my @CDS = &get_feature($mRNA_info, "CDS");
68+
my $CDS_length = &cal_length(\@CDS);
69+
push @CDS_length, $CDS_length;
70+
$mRNA_info{$mRNA_ID}{"CDS_length"} = $CDS_length;
71+
72+
# 分析mRNA的Integrity参数值(若未找到,统一设置为 complete)和Transcript_Ratio参数值(若未找到,则统一设置为 1)
73+
my ($intergrity, $transcript_ratio) = ("complete", 100);
74+
$intergrity = $1 if $mRNA_header[-1] =~ m/Integrity=(\w+)/;
75+
$transcript_ratio = $1 if $mRNA_header[-1] =~ m/Transcript_Ratio=([^;%]+)/;
76+
#print STDERR "$mRNA_ID\t$intergrity\t$transcript_ratio\t$CDS_length\n";
77+
$mRNA_info{$mRNA_ID}{"intergrity"} = $intergrity;
78+
$mRNA_info{$mRNA_ID}{"transcript_ratio"} = $transcript_ratio;
79+
}
80+
81+
# 统计编码转录本个数
82+
my $as_num = %mRNA_info;
83+
$stats{$as_num}{"number"} ++;
84+
push @{$stats{$as_num}{"gene_ID"}}, $gene_ID;
85+
86+
# 仅保留较长CDS长度的mRNA
87+
@CDS_length = sort {$b <=> $a} @CDS_length;
88+
foreach ( keys %mRNA_info ) {
89+
delete $mRNA_info{$_} if $mRNA_info{$_}{"CDS_length"} < $CDS_length[0] * $ratio_to_maximum_CDS_length;
90+
}
91+
92+
# 对转录本进行排序
93+
my %intergrity = ("complete", 1, "5prime_partial", 2, "3prime_partial", 3, "internal", 4);
94+
my @sort_mRNA_ID = sort { $intergrity{$mRNA_info{$a}{"intergrity"}} <=> $intergrity{$mRNA_info{$b}{"intergrity"}}
95+
or $mRNA_info{$b}{"transcript_ratio"} <=> $mRNA_info{$a}{"transcript_ratio"}
96+
or $mRNA_info{$b}{"CDS_length"} <=> $mRNA_info{$a}{"CDS_length"}
97+
or $a cmp $b } keys %mRNA_info;
98+
99+
# 输出最优基因模型
100+
print $gene_info{$gene_ID}{"mRNA_header"}{$sort_mRNA_ID[0]};
101+
print $gene_info{$gene_ID}{"mRNA_info"}{$sort_mRNA_ID[0]};
102+
print "\n";
103+
#print STDERR "Best: $sort_mRNA_ID[0]\t$mRNA_info{$sort_mRNA_ID[0]}{'intergrity'}\t$mRNA_info{$sort_mRNA_ID[0]}{'transcript_ratio'}\t$mRNA_info{$sort_mRNA_ID[0]}{'CDS_length'}\n\n";
104+
}
105+
106+
# 编码转录本的可变剪接数量统计
107+
my ($all_gene_num, $as_gene_num, $noAS_gene_num) = (0, 0, 0);
108+
my $out;
109+
foreach ( sort {$a <=> $b} keys %stats ) {
110+
$out .= "$_\t$stats{$_}{'number'}\t";
111+
$out .= join(",", @{$stats{$_}{"gene_ID"}});
112+
$out .= "\n";
113+
114+
$all_gene_num += $stats{$_}{'number'};
115+
if ( $_ == 1 ) {
116+
$noAS_gene_num += $stats{$_}{'number'};
117+
}
118+
elsif ( $_ > 1 ) {
119+
$as_gene_num += $stats{$_}{'number'};
120+
}
121+
}
122+
print STDERR "Total Genes Num is : $all_gene_num\n";
123+
print STDERR "No Alternative Spliced Genes Num is : $noAS_gene_num\n";
124+
print STDERR "Alternative Spliced Genes Num is : $as_gene_num\n";
125+
print STDERR "coding_transcript_num\tgene_num\tgene_ID\n";
126+
print STDERR $out;
127+
128+
129+
# 子程序,返回基因的GFF3哈希信息:
130+
# gene_ID => "header" => gene_header
131+
# gene_ID => "mRNA_ID" => 数组
132+
# gene_ID => "mRNA_header" => mRNA_ID => mRNA_header
133+
# gene_ID => "mRNA_info" => mRNA_ID => mRNA_Info
134+
sub get_geneModels_from_GFF3 {
135+
my %gene_info;
136+
# 第一轮,找gene信息
137+
open IN, $_[0] or die "Can not open file $_[0], $!";
138+
while (<IN>) {
139+
if ( m/\tgene\t.*ID=([^;\s]+)/ ) {
140+
$gene_info{$1}{"header"} = $_;
141+
}
142+
}
143+
close IN;
144+
# 第二轮,找Parent值是geneID的信息,包含但不限于 mRNA 信息
145+
my %mRNA_ID2gene_ID;
146+
open IN, $_[0] or die "Can not open file $_[0], $!";
147+
while (<IN>) {
148+
if ( m/Parent=([^;\s]+)/ ) {
149+
my $parent = $1;
150+
if ( exists $gene_info{$parent} ) {
151+
if ( m/ID=([^;\s]+)/ ) {
152+
push @{$gene_info{$parent}{"mRNA_ID"}}, $1;
153+
$gene_info{$parent}{"mRNA_header"}{$1} = $_;
154+
$mRNA_ID2gene_ID{$1} = $parent;
155+
}
156+
}
157+
}
158+
}
159+
close IN;
160+
# 第三轮,找Parent值不是geneID的信息
161+
open IN, $_[0] or die "Can not open file $_[0], $!";
162+
while (<IN>) {
163+
if ( m/Parent=([^;\s]+)/ && exists $mRNA_ID2gene_ID{$1} ) {
164+
my $parent = $1;
165+
$gene_info{$mRNA_ID2gene_ID{$1}}{"mRNA_info"}{$parent} .= $_;
166+
}
167+
}
168+
close IN;
169+
170+
return %gene_info;
171+
}
172+
173+
sub get_feature {
174+
my ($info, $feature_name) = @_;
175+
my @out;
176+
foreach ( split /\n/, $info ) {
177+
@_ = split /\t/;
178+
if ( $_[2] eq $feature_name ) {
179+
push @out, "$_[3]\t$_[4]";
180+
}
181+
}
182+
return @out;
183+
}
184+
185+
sub cal_length {
186+
my @region = @{$_[0]};
187+
my $out;
188+
foreach (@region) {
189+
@_ = split /\t/;
190+
$out += abs($_[1] - $_[0]) + 1;
191+
}
192+
return $out;
193+
}
194+

bin/geta.pl

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -278,8 +278,8 @@
278278
mkdir "0.RepeatMasker" unless -e "0.RepeatMasker";
279279
unless (-e "0.RepeatMasker.ok") {
280280
chdir "0.RepeatMasker";
281-
mkdir "repeatMasker" unless -e "repeatMasker";
282-
chdir "repeatMasker";
281+
mkdir "repeatMasker" unless -e "repeatMasker";
282+
chdir "repeatMasker";
283283
$pwd = `pwd`; print STDERR "PWD: $pwd";
284284

285285
# 进行RepeatMasker分析
@@ -297,7 +297,7 @@
297297
else {
298298
print STDERR "CMD(Skipped): $cmdString\n";
299299
}
300-
chdir "../";
300+
chdir "../";
301301

302302
# 进行RepeatModeler分析
303303
mkdir "repeatModeler" unless -e "repeatModeler";
@@ -1494,6 +1494,10 @@
14941494
print STDERR (localtime) . ": CMD: $cmdString\n";
14951495
system("$cmdString") == 0 or die "failed to execute: $cmdString\n";
14961496

1497+
$cmdString = "$dirname/bin/GFF3_extract_bestGeneModels $out_prefix.GeneModels.gff3 > $out_prefix.bestGeneModels.gff3 2> $out_prefix.AS_num_of_codingTranscripts.stats";
1498+
print STDERR (localtime) . ": CMD: $cmdString\n";
1499+
system("$cmdString") == 0 or die "failed to execute: $cmdString\n";
1500+
14971501
$cmdString = "$dirname/bin/GFF3Clear --GFF3_source GETA --gene_prefix ${out_prefix}ncGene --gene_code_length 6 --genome $genome --no_attr_add $out_prefix.tmp/6.combineGeneModels/geneModels.h.lncRNA.gff3 > $out_prefix.GeneModels_lncRNA.gff3 2> /dev/null";
14981502
print STDERR (localtime) . ": CMD: $cmdString\n";
14991503
system("$cmdString") == 0 or die "failed to execute: $cmdString\n";
@@ -1508,7 +1512,11 @@
15081512
print STDERR (localtime) . ": CMD: $cmdString\n";
15091513
system("$cmdString") == 0 or die "failed to execute: $cmdString\n";
15101514

1511-
$cmdString = "$dirname/bin/eukaryotic_gene_model_statistics.pl $out_prefix.GeneModels.gtf $genome $out_prefix &> $out_prefix.GeneModels.stats";
1515+
$cmdString = "$dirname/bin/gff3ToGtf.pl $genome $out_prefix.bestGeneModels.gff3 > $out_prefix.bestGeneModels.gtf 2> /dev/null";
1516+
print STDERR (localtime) . ": CMD: $cmdString\n";
1517+
system("$cmdString") == 0 or die "failed to execute: $cmdString\n";
1518+
1519+
$cmdString = "$dirname/bin/eukaryotic_gene_model_statistics.pl $out_prefix.bestGeneModels.gtf $genome $out_prefix &> $out_prefix.GeneModels.stats";
15121520
print STDERR (localtime) . ": CMD: $cmdString\n";
15131521
system("$cmdString") == 0 or die "failed to execute: $cmdString\n";
15141522

bin/para_hmmscan

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ Usage:
99
程序调用hmmscan将protein.fasta文件中的蛋白序列比对到hmm数据库中,并将hmmscan的domtbl结果进行过滤并输出到标准输出。
1010
调用hmmscan进行比对时,使用--cut_ga参数,即根据hmm文件中自带的GA类型的bit scores阈值进行过滤。在hmmscan文档中提示为:GA thresholds are generally considered to be the reliable curated thresholds defining family membership。使用--cut_ga方法能对不同的hmm模型使用不同的阈值进行过滤,但此时hmmscan不能使用evalue方法进行过滤。
1111
在得到hmmscan结果后,进一步使用evalue和coverage进行过滤。若一个蛋白序列和一个hmm模型有多个匹配区域,则计算多个匹配区域去冗余后的长度对hmm模型的覆盖率。
12+
此外,当程序运行中断,或对新的输入文件重新进行分析时,若未删除中间文件,本程序能解析已有的结果,仅对未分析的序列进行hmmscan分析,能节约程序运行时间。最终,程序仅输出输入文件中序列的hmmscan分析结果。
1213
1314
--outformat
1415
当添加该参数时,输出简单的7列表格结果:(1)GeneID;(2)HMM accession,例如Pfam编号;(3)HMM Name;(4)E-value;(5)Score;(6)Coverage;(7)Description。默认情况下,输出的表格格式和hmmscan的domtbl一致且含有大量注释行信息。

0 commit comments

Comments
 (0)