Skip to content

Commit 313c84f

Browse files
authored
Merge pull request #46 from rhpvorderman/graphpolish
Graphpolish
2 parents 505e6dd + 7635ce5 commit 313c84f

File tree

1 file changed

+51
-9
lines changed

1 file changed

+51
-9
lines changed

src/sequali/report_modules.py

Lines changed: 51 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -82,18 +82,21 @@ def equidistant_ranges(length: int, parts: int) -> Iterator[Tuple[int, int]]:
8282
start = stop
8383

8484

85-
def logarithmic_ranges(length: int):
86-
# Use a scaling factor: this needs 300 units to reach the length of the
85+
def logarithmic_ranges(length: int, min_distance: int = 5):
86+
"""
87+
Gives a squashed logarithmic range. It is not truly logarithmic as the
88+
minimum distance ensures that the lower units are more tightly packed.
89+
"""
90+
# Use a scaling factor: this needs 400 units to reach the length of the
8791
# largest human chromosome. This will still fit on a graph once we reach
88-
# those sequencing sizes. By utilizing the same scaling factor, this
89-
# program will have more comparable plots between FASTQ files.
90-
scaling_factor = 250_000_000 ** (1 / 300)
92+
# those sequencing sizes.
93+
scaling_factor = 250_000_000 ** (1 / 400)
9194
i = 0
9295
start = 0
9396
while True:
9497
stop = round(scaling_factor ** i)
9598
i += 1
96-
if stop > start:
99+
if stop >= start + min_distance:
97100
yield start, stop
98101
start = stop
99102
if stop >= length:
@@ -469,9 +472,14 @@ class PerSequenceAverageQualityScores(ReportModule):
469472
x_labels: Tuple[str, ...] = tuple(str(x) for x in range(PHRED_MAX + 1))
470473

471474
def plot(self) -> str:
475+
maximum_score = 0
476+
for i, count in enumerate(self.average_quality_counts):
477+
if count > 0:
478+
maximum_score = i
479+
maximum_score = max(maximum_score + 2, 40)
472480
plot = pygal.Bar(
473481
title="Per sequence quality scores",
474-
x_labels=range(PHRED_MAX + 1),
482+
x_labels=range(maximum_score + 1),
475483
x_labels_major_every=3,
476484
show_minor_x_labels=False,
477485
style=ONE_SERIE_STYLE,
@@ -482,7 +490,8 @@ def plot(self) -> str:
482490
total = sum(self.average_quality_counts)
483491
percentage_scores = [100 * score / total
484492
for score in self.average_quality_counts]
485-
plot.add("", percentage_scores)
493+
494+
plot.add("", percentage_scores[:maximum_score])
486495
return plot.render(is_unicode=True)
487496

488497
def to_html(self) -> str:
@@ -612,7 +621,9 @@ def to_html(self) -> str:
612621
@dataclasses.dataclass
613622
class PerSequenceGCContent(ReportModule):
614623
gc_content_counts: Sequence[int]
624+
smoothened_gc_content_counts: Sequence[int]
615625
x_labels: Sequence[str] = tuple(str(x) for x in range(101))
626+
smoothened_x_labels: Sequence[str] = tuple(str(x) for x in range(0, 101, 2))
616627

617628
def plot(self):
618629
plot = pygal.Bar(
@@ -628,15 +639,46 @@ def plot(self):
628639
plot.add("", self.gc_content_counts)
629640
return plot.render(is_unicode=True)
630641

642+
def smoothened_plot(self):
643+
plot = pygal.Line(
644+
title="Per sequence GC content (smoothened)",
645+
x_labels=self.smoothened_x_labels,
646+
x_labels_major_every=3,
647+
show_minor_x_labels=False,
648+
x_title="GC %",
649+
y_title="number of reads",
650+
interpolate="cubic",
651+
style=ONE_SERIE_STYLE,
652+
**COMMON_GRAPH_OPTIONS,
653+
)
654+
plot.add("", self.smoothened_gc_content_counts)
655+
return plot.render(is_unicode=True)
656+
631657
def to_html(self) -> str:
632658
return f"""
633659
<h2>Per sequence GC content</h2>
660+
For short reads with fixed size (i.e. Illumina) the plot will
661+
look very spiky due to the GC content calculation: GC bases / all
662+
bases. For read lengths of 151, both 75 and 76 GC bases lead to a
663+
percentage of 50% (rounded) and 72 and 73 GC bases leads to 48%
664+
(rounded). Only 74 GC bases leads to 49%. As a result the
665+
even categories will be twice as high, which creates a spike. The
666+
smoothened plot is provided to give a clearer picture in this case.
667+
<br>
634668
{self.plot()}
669+
{self.smoothened_plot()}
635670
"""
636671

637672
@classmethod
638673
def from_qc_metrics(cls, metrics: QCMetrics):
639-
return cls(list(metrics.gc_content()))
674+
gc_content = list(metrics.gc_content())
675+
smoothened_gc_content = []
676+
gc_content_iter = iter(gc_content)
677+
for i in range(50):
678+
smoothened_gc_content.append(next(gc_content_iter) + next(gc_content_iter))
679+
# Append the last 100% category.
680+
smoothened_gc_content.append(next(gc_content_iter))
681+
return cls(gc_content, smoothened_gc_content)
640682

641683

642684
@dataclasses.dataclass

0 commit comments

Comments
 (0)