|
| 1 | +#!/usr/bin/env perl |
| 2 | +use strict; |
| 3 | +use Getopt::Long; |
| 4 | +use Cwd qw/abs_path getcwd cwd/; |
| 5 | +use File::Basename; |
| 6 | + |
| 7 | +my $usage = <<USAGE; |
| 8 | +Usage: |
| 9 | + $0 [options] input.gff3 > bestGeneModels.gff3 2> AS_num_of_codingTranscripts.stats |
| 10 | +
|
| 11 | + 本程序用于对输入的GFF3文件进行分析,若一个基因对应多个转录本,则输出最优的基因模型。 |
| 12 | + 程序的运行原理: |
| 13 | + (1)当一个基因有多个转录本时,首先剔除非编码转录本,仅保留mRNA类型转录本; |
| 14 | + (2)对多个mRNA类型转录本,对其CDS长度进行分析,找到最长的CDS长度,保留CDS长度达到指定阈值(最长CDS长度 * 80%)的mRNA转录本; |
| 15 | + (3)在mRNA信息第九列中寻找Integrity参数值(若未找到,统一设置为 complete)和Transcript_Ratio参数值(若未找到,则统一设置为 1),首先根据Integrity信息,按complete,5prime_partial,3prime_partial,internal排序;再根据Transcript_Ratio数值,按从大到小排序;再按CDS长度从长到短排序,最后按mRNA ID的ASCII码排序,选择排在第一位的mRNA作为最优的基因模型。 |
| 16 | + (4)程序在标准错误输出中给出(可变剪接的)编码转录本数量及其对应的基因数量和ID信息。 |
| 17 | + (4)此外,程序输出的基因顺序和输入文件的基因顺序一致。 |
| 18 | +
|
| 19 | + --ratio_to_maximum_CDS_length <float> default: 0.8 |
| 20 | + 选择最优基因模型时,先保留CDS长度较长的多个转录本,要求这些转录本的长度至少达到最长CDS长度的一定比例。 |
| 21 | +
|
| 22 | + --help default: None |
| 23 | + display this help and exit. |
| 24 | +
|
| 25 | +USAGE |
| 26 | +if (@ARGV==0) {die $usage} |
| 27 | + |
| 28 | +my ($help_flag, $ratio_to_maximum_CDS_length); |
| 29 | +GetOptions( |
| 30 | + "help" => \$help_flag, |
| 31 | + "ratio_to_maximum_CDS_length:f" => \$ratio_to_maximum_CDS_length, |
| 32 | +); |
| 33 | +$ratio_to_maximum_CDS_length ||= 0.8; |
| 34 | + |
| 35 | +if ( $help_flag ) { die $usage } |
| 36 | + |
| 37 | +# 解析GFF3文件内容 |
| 38 | +my %gene_info = &get_geneModels_from_GFF3($ARGV[0]); |
| 39 | + |
| 40 | +# 获得GFF3文件中基因ID的顺序 |
| 41 | +my (@geneID, %geneID); |
| 42 | +open IN, $ARGV[0] or die "Can not open file $ARGV[0], $!"; |
| 43 | +while (<IN>) { |
| 44 | + next if /^\s*$/; |
| 45 | + next if /^#/; |
| 46 | + if (/\tgene\t.*ID=([^;]+)/) { |
| 47 | + push @geneID, $1 unless exists $geneID{$1}; |
| 48 | + $geneID{$1} = 1; |
| 49 | + } |
| 50 | +} |
| 51 | +close IN; |
| 52 | + |
| 53 | +# 对每个基因进行最优基因模型分析 |
| 54 | +my %stats; |
| 55 | +foreach my $gene_ID ( @geneID ) { |
| 56 | + print $gene_info{$gene_ID}{"header"}; |
| 57 | + |
| 58 | + # 分析 mRNA 的CDS长度、完整性和转录本比例信息。 |
| 59 | + my (%mRNA_info, @CDS_length); |
| 60 | + foreach my $mRNA_ID ( @{$gene_info{$gene_ID}{"mRNA_ID"}} ) { |
| 61 | + my $mRNA_header = $gene_info{$gene_ID}{"mRNA_header"}{$mRNA_ID}; |
| 62 | + my @mRNA_header = split /\t/, $mRNA_header; |
| 63 | + next if $mRNA_header[2] ne "mRNA"; |
| 64 | + my $mRNA_info = $gene_info{$gene_ID}{"mRNA_info"}{$mRNA_ID}; |
| 65 | + |
| 66 | + # 计算转录本的CDS信息 |
| 67 | + my @CDS = &get_feature($mRNA_info, "CDS"); |
| 68 | + my $CDS_length = &cal_length(\@CDS); |
| 69 | + push @CDS_length, $CDS_length; |
| 70 | + $mRNA_info{$mRNA_ID}{"CDS_length"} = $CDS_length; |
| 71 | + |
| 72 | + # 分析mRNA的Integrity参数值(若未找到,统一设置为 complete)和Transcript_Ratio参数值(若未找到,则统一设置为 1) |
| 73 | + my ($intergrity, $transcript_ratio) = ("complete", 100); |
| 74 | + $intergrity = $1 if $mRNA_header[-1] =~ m/Integrity=(\w+)/; |
| 75 | + $transcript_ratio = $1 if $mRNA_header[-1] =~ m/Transcript_Ratio=([^;%]+)/; |
| 76 | + #print STDERR "$mRNA_ID\t$intergrity\t$transcript_ratio\t$CDS_length\n"; |
| 77 | + $mRNA_info{$mRNA_ID}{"intergrity"} = $intergrity; |
| 78 | + $mRNA_info{$mRNA_ID}{"transcript_ratio"} = $transcript_ratio; |
| 79 | + } |
| 80 | + |
| 81 | + # 统计编码转录本个数 |
| 82 | + my $as_num = %mRNA_info; |
| 83 | + $stats{$as_num}{"number"} ++; |
| 84 | + push @{$stats{$as_num}{"gene_ID"}}, $gene_ID; |
| 85 | + |
| 86 | + # 仅保留较长CDS长度的mRNA |
| 87 | + @CDS_length = sort {$b <=> $a} @CDS_length; |
| 88 | + foreach ( keys %mRNA_info ) { |
| 89 | + delete $mRNA_info{$_} if $mRNA_info{$_}{"CDS_length"} < $CDS_length[0] * $ratio_to_maximum_CDS_length; |
| 90 | + } |
| 91 | + |
| 92 | + # 对转录本进行排序 |
| 93 | + my %intergrity = ("complete", 1, "5prime_partial", 2, "3prime_partial", 3, "internal", 4); |
| 94 | + my @sort_mRNA_ID = sort { $intergrity{$mRNA_info{$a}{"intergrity"}} <=> $intergrity{$mRNA_info{$b}{"intergrity"}} |
| 95 | + or $mRNA_info{$b}{"transcript_ratio"} <=> $mRNA_info{$a}{"transcript_ratio"} |
| 96 | + or $mRNA_info{$b}{"CDS_length"} <=> $mRNA_info{$a}{"CDS_length"} |
| 97 | + or $a cmp $b } keys %mRNA_info; |
| 98 | + |
| 99 | + # 输出最优基因模型 |
| 100 | + print $gene_info{$gene_ID}{"mRNA_header"}{$sort_mRNA_ID[0]}; |
| 101 | + print $gene_info{$gene_ID}{"mRNA_info"}{$sort_mRNA_ID[0]}; |
| 102 | + print "\n"; |
| 103 | + #print STDERR "Best: $sort_mRNA_ID[0]\t$mRNA_info{$sort_mRNA_ID[0]}{'intergrity'}\t$mRNA_info{$sort_mRNA_ID[0]}{'transcript_ratio'}\t$mRNA_info{$sort_mRNA_ID[0]}{'CDS_length'}\n\n"; |
| 104 | +} |
| 105 | + |
| 106 | +# 编码转录本的可变剪接数量统计 |
| 107 | +my ($all_gene_num, $as_gene_num, $noAS_gene_num) = (0, 0, 0); |
| 108 | +my $out; |
| 109 | +foreach ( sort {$a <=> $b} keys %stats ) { |
| 110 | + $out .= "$_\t$stats{$_}{'number'}\t"; |
| 111 | + $out .= join(",", @{$stats{$_}{"gene_ID"}}); |
| 112 | + $out .= "\n"; |
| 113 | + |
| 114 | + $all_gene_num += $stats{$_}{'number'}; |
| 115 | + if ( $_ == 1 ) { |
| 116 | + $noAS_gene_num += $stats{$_}{'number'}; |
| 117 | + } |
| 118 | + elsif ( $_ > 1 ) { |
| 119 | + $as_gene_num += $stats{$_}{'number'}; |
| 120 | + } |
| 121 | +} |
| 122 | +print STDERR "Total Genes Num is : $all_gene_num\n"; |
| 123 | +print STDERR "No Alternative Spliced Genes Num is : $noAS_gene_num\n"; |
| 124 | +print STDERR "Alternative Spliced Genes Num is : $as_gene_num\n"; |
| 125 | +print STDERR "coding_transcript_num\tgene_num\tgene_ID\n"; |
| 126 | +print STDERR $out; |
| 127 | + |
| 128 | + |
| 129 | +# 子程序,返回基因的GFF3哈希信息: |
| 130 | +# gene_ID => "header" => gene_header |
| 131 | +# gene_ID => "mRNA_ID" => 数组 |
| 132 | +# gene_ID => "mRNA_header" => mRNA_ID => mRNA_header |
| 133 | +# gene_ID => "mRNA_info" => mRNA_ID => mRNA_Info |
| 134 | +sub get_geneModels_from_GFF3 { |
| 135 | + my %gene_info; |
| 136 | + # 第一轮,找gene信息 |
| 137 | + open IN, $_[0] or die "Can not open file $_[0], $!"; |
| 138 | + while (<IN>) { |
| 139 | + if ( m/\tgene\t.*ID=([^;\s]+)/ ) { |
| 140 | + $gene_info{$1}{"header"} = $_; |
| 141 | + } |
| 142 | + } |
| 143 | + close IN; |
| 144 | + # 第二轮,找Parent值是geneID的信息,包含但不限于 mRNA 信息 |
| 145 | + my %mRNA_ID2gene_ID; |
| 146 | + open IN, $_[0] or die "Can not open file $_[0], $!"; |
| 147 | + while (<IN>) { |
| 148 | + if ( m/Parent=([^;\s]+)/ ) { |
| 149 | + my $parent = $1; |
| 150 | + if ( exists $gene_info{$parent} ) { |
| 151 | + if ( m/ID=([^;\s]+)/ ) { |
| 152 | + push @{$gene_info{$parent}{"mRNA_ID"}}, $1; |
| 153 | + $gene_info{$parent}{"mRNA_header"}{$1} = $_; |
| 154 | + $mRNA_ID2gene_ID{$1} = $parent; |
| 155 | + } |
| 156 | + } |
| 157 | + } |
| 158 | + } |
| 159 | + close IN; |
| 160 | + # 第三轮,找Parent值不是geneID的信息 |
| 161 | + open IN, $_[0] or die "Can not open file $_[0], $!"; |
| 162 | + while (<IN>) { |
| 163 | + if ( m/Parent=([^;\s]+)/ && exists $mRNA_ID2gene_ID{$1} ) { |
| 164 | + my $parent = $1; |
| 165 | + $gene_info{$mRNA_ID2gene_ID{$1}}{"mRNA_info"}{$parent} .= $_; |
| 166 | + } |
| 167 | + } |
| 168 | + close IN; |
| 169 | + |
| 170 | + return %gene_info; |
| 171 | +} |
| 172 | + |
| 173 | +sub get_feature { |
| 174 | + my ($info, $feature_name) = @_; |
| 175 | + my @out; |
| 176 | + foreach ( split /\n/, $info ) { |
| 177 | + @_ = split /\t/; |
| 178 | + if ( $_[2] eq $feature_name ) { |
| 179 | + push @out, "$_[3]\t$_[4]"; |
| 180 | + } |
| 181 | + } |
| 182 | + return @out; |
| 183 | +} |
| 184 | + |
| 185 | +sub cal_length { |
| 186 | + my @region = @{$_[0]}; |
| 187 | + my $out; |
| 188 | + foreach (@region) { |
| 189 | + @_ = split /\t/; |
| 190 | + $out += abs($_[1] - $_[0]) + 1; |
| 191 | + } |
| 192 | + return $out; |
| 193 | +} |
| 194 | + |
0 commit comments