@@ -82,18 +82,21 @@ def equidistant_ranges(length: int, parts: int) -> Iterator[Tuple[int, int]]:
82
82
start = stop
83
83
84
84
85
- def logarithmic_ranges (length : int ):
86
- # Use a scaling factor: this needs 300 units to reach the length of the
85
+ def logarithmic_ranges (length : int , min_distance : int = 5 ):
86
+ """
87
+ Gives a squashed logarithmic range. It is not truly logarithmic as the
88
+ minimum distance ensures that the lower units are more tightly packed.
89
+ """
90
+ # Use a scaling factor: this needs 400 units to reach the length of the
87
91
# largest human chromosome. This will still fit on a graph once we reach
88
- # those sequencing sizes. By utilizing the same scaling factor, this
89
- # program will have more comparable plots between FASTQ files.
90
- scaling_factor = 250_000_000 ** (1 / 300 )
92
+ # those sequencing sizes.
93
+ scaling_factor = 250_000_000 ** (1 / 400 )
91
94
i = 0
92
95
start = 0
93
96
while True :
94
97
stop = round (scaling_factor ** i )
95
98
i += 1
96
- if stop > start :
99
+ if stop >= start + min_distance :
97
100
yield start , stop
98
101
start = stop
99
102
if stop >= length :
@@ -469,9 +472,14 @@ class PerSequenceAverageQualityScores(ReportModule):
469
472
x_labels : Tuple [str , ...] = tuple (str (x ) for x in range (PHRED_MAX + 1 ))
470
473
471
474
def plot (self ) -> str :
475
+ maximum_score = 0
476
+ for i , count in enumerate (self .average_quality_counts ):
477
+ if count > 0 :
478
+ maximum_score = i
479
+ maximum_score = max (maximum_score + 2 , 40 )
472
480
plot = pygal .Bar (
473
481
title = "Per sequence quality scores" ,
474
- x_labels = range (PHRED_MAX + 1 ),
482
+ x_labels = range (maximum_score + 1 ),
475
483
x_labels_major_every = 3 ,
476
484
show_minor_x_labels = False ,
477
485
style = ONE_SERIE_STYLE ,
@@ -482,7 +490,8 @@ def plot(self) -> str:
482
490
total = sum (self .average_quality_counts )
483
491
percentage_scores = [100 * score / total
484
492
for score in self .average_quality_counts ]
485
- plot .add ("" , percentage_scores )
493
+
494
+ plot .add ("" , percentage_scores [:maximum_score ])
486
495
return plot .render (is_unicode = True )
487
496
488
497
def to_html (self ) -> str :
@@ -612,7 +621,9 @@ def to_html(self) -> str:
612
621
@dataclasses .dataclass
613
622
class PerSequenceGCContent (ReportModule ):
614
623
gc_content_counts : Sequence [int ]
624
+ smoothened_gc_content_counts : Sequence [int ]
615
625
x_labels : Sequence [str ] = tuple (str (x ) for x in range (101 ))
626
+ smoothened_x_labels : Sequence [str ] = tuple (str (x ) for x in range (0 , 101 , 2 ))
616
627
617
628
def plot (self ):
618
629
plot = pygal .Bar (
@@ -628,15 +639,46 @@ def plot(self):
628
639
plot .add ("" , self .gc_content_counts )
629
640
return plot .render (is_unicode = True )
630
641
642
+ def smoothened_plot (self ):
643
+ plot = pygal .Line (
644
+ title = "Per sequence GC content (smoothened)" ,
645
+ x_labels = self .smoothened_x_labels ,
646
+ x_labels_major_every = 3 ,
647
+ show_minor_x_labels = False ,
648
+ x_title = "GC %" ,
649
+ y_title = "number of reads" ,
650
+ interpolate = "cubic" ,
651
+ style = ONE_SERIE_STYLE ,
652
+ ** COMMON_GRAPH_OPTIONS ,
653
+ )
654
+ plot .add ("" , self .smoothened_gc_content_counts )
655
+ return plot .render (is_unicode = True )
656
+
631
657
def to_html (self ) -> str :
632
658
return f"""
633
659
<h2>Per sequence GC content</h2>
660
+ For short reads with fixed size (i.e. Illumina) the plot will
661
+ look very spiky due to the GC content calculation: GC bases / all
662
+ bases. For read lengths of 151, both 75 and 76 GC bases lead to a
663
+ percentage of 50% (rounded) and 72 and 73 GC bases leads to 48%
664
+ (rounded). Only 74 GC bases leads to 49%. As a result the
665
+ even categories will be twice as high, which creates a spike. The
666
+ smoothened plot is provided to give a clearer picture in this case.
667
+ <br>
634
668
{ self .plot ()}
669
+ { self .smoothened_plot ()}
635
670
"""
636
671
637
672
@classmethod
638
673
def from_qc_metrics (cls , metrics : QCMetrics ):
639
- return cls (list (metrics .gc_content ()))
674
+ gc_content = list (metrics .gc_content ())
675
+ smoothened_gc_content = []
676
+ gc_content_iter = iter (gc_content )
677
+ for i in range (50 ):
678
+ smoothened_gc_content .append (next (gc_content_iter ) + next (gc_content_iter ))
679
+ # Append the last 100% category.
680
+ smoothened_gc_content .append (next (gc_content_iter ))
681
+ return cls (gc_content , smoothened_gc_content )
640
682
641
683
642
684
@dataclasses .dataclass
0 commit comments