Skip to content

Commit 8c635f0

Browse files
committed
merge gencov and align reports
1 parent 72826e7 commit 8c635f0

File tree

7 files changed

+115
-922
lines changed

7 files changed

+115
-922
lines changed

src/harpy/align.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,7 @@ def bwa(input, output_dir, genome, threads, extra_params, quality_filter, molecu
5555
fetch_rule(workflowdir, "align-bwa.smk")
5656
fetch_script(workflowdir, "assignMI.py")
5757
fetch_script(workflowdir, "bxStats.py")
58-
for i in ["BxStats", "Gencov"]:
59-
fetch_report(workflowdir, f"{i}.Rmd")
58+
fetch_report(workflowdir, "AlignStats.Rmd")
6059

6160
with open(f"{workflowdir}/config.yml", "w") as config:
6261
config.write(f"genomefile: {genome}\n")
@@ -138,7 +137,7 @@ def ema(input, output_dir, platform, whitelist, genome, threads, ema_bins, skipr
138137
validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"])
139138
fetch_rule(workflowdir, "align-ema.smk")
140139
fetch_script(workflowdir, "bxStats.py")
141-
for i in ["EmaCount", "EmaGencov", "BxStats"]:
140+
for i in ["EmaCount", "AlignStats"]:
142141
fetch_report(workflowdir, f"{i}.Rmd")
143142

144143
with open(f"{workflowdir}/config.yml", "w") as config:
@@ -209,8 +208,7 @@ def minimap(input, output_dir, genome, threads, extra_params, quality_filter, mo
209208
fetch_rule(workflowdir, "align-minimap.smk")
210209
fetch_script(workflowdir, "assignMI.py")
211210
fetch_script(workflowdir, "bxStats.py")
212-
for i in ["BxStats", "Gencov"]:
213-
fetch_report(workflowdir, f"{i}.Rmd")
211+
fetch_report(workflowdir, "AlignStats.Rmd")
214212

215213
with open(f"{workflowdir}/config.yml", "w") as config:
216214
config.write(f"genomefile: {genome}\n")

src/harpy/reports/BxStats.Rmd renamed to src/harpy/reports/AlignStats.Rmd

Lines changed: 70 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
---
2-
title: "Haplotag Molecule Report"
2+
title: "Harpy Alignment Report"
33
date: "`r format(Sys.time(), '%m-%d-%y %X')`"
44
output:
55
flexdashboard::flex_dashboard:
@@ -95,12 +95,12 @@ valueBox(scales::comma(totuniqBX), caption = "Total unique molecules", color = "
9595
```
9696

9797
## N50 and N90
98-
### Molecule Length Metrics
99-
```{r echo = FALSE, message = FALSE, warning = FALSE, out.width = '70%'}
98+
### Molecule NXX Length Metrics
99+
```{r echo = FALSE, message = FALSE, warning = FALSE}
100100
valids %>%
101101
group_by(contig) %>%
102-
summarize(n50 = NX(length_inferred, 50), n75 = NX(length_inferred, 75), n90 = NX(length_inferred, 90)) %>%
103-
as.data.frame() %>% knitr::kable()
102+
summarize(n50 = NX(length_inferred, 50), n75 = NX(length_inferred, 75), n90 = NX(length_inferred, 90)) %>%
103+
DT::datatable(rownames = F, options = list(dom = 'Brtip', buttons = c('csv')), fillContainer = T)
104104
```
105105

106106
## Reads per molecule dec
@@ -125,34 +125,32 @@ hs <- hist(
125125
breaks = min(valids$reads):max(valids$reads),
126126
plot = F
127127
)
128-
df <- data.frame(var = hs$mids, freq = round(hs$counts / sum(hs$counts) *100, 2))
128+
hs$counts <- round(hs$counts / sum(hs$counts) * 100, 2)
129+
hs <- data.frame(val = hs$breaks[-1], freq = hs$counts)
129130
130-
hchart(density(valids$reads), color = "#8484bd", name = "density") |>
131+
hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#8484bd", name = "% of molecules") |>
131132
hc_title(text = "Reads Per Molecule") |>
132133
hc_xAxis(title = list(text = "reads per molecule")) |>
133-
hc_yAxis(title = list(text = "density")) |>
134+
hc_yAxis(title = list(text = "% molecules")) |>
134135
hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |>
135136
hc_tooltip(crosshairs = TRUE) |>
136-
hc_exporting(enabled = T, filename = paste0(samplename, ".readsper")
137+
hc_exporting(enabled = T, filename = paste0(samplename, ".readsper"))
137138
138139
```
139140

140141
### bases per {.no-title}
141142
```{r basesper, echo = FALSE, message = FALSE, warning = FALSE, out.width="100%"}
142-
#hs <- hist(
143-
# round(valids$aligned_bp, -2),
144-
# breaks = seq(max(1, valids$aligned_bp), round(max(valids$aligned_bp + 150, -2)), by = 200),
145-
# plot = F
146-
#)
147-
#df <- data.frame(var = hs$mids, freq = round(hs$counts / sum(hs$counts)*100,2))
148-
149-
hchart(density(valids$aligned_bp), color = "#75b89e", name = "density") |>
143+
hs <- hist(round(valids$aligned_bp, -2), breaks = 50, plot = F)
144+
hs$counts <- round(hs$counts / sum(hs$counts)*100,2)
145+
hs <- data.frame(val = hs$breaks[-1], freq = hs$counts)
146+
147+
hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#75b89e", name = "% of molecules") |>
150148
hc_title(text = "Bases Aligned Per Molecule") |>
151149
hc_xAxis(title = list(text = "aligned bases per molecule")) |>
152-
hc_yAxis(title = list(text = "density")) |>
150+
hc_yAxis(title = list(text = "% molecules")) |>
153151
hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |>
154152
hc_tooltip(crosshairs = TRUE) |>
155-
hc_exporting(enabled = T, filename = paste0(samplename, ".basesper")
153+
hc_exporting(enabled = T, filename = paste0(samplename, ".basesper"))
156154
```
157155

158156

@@ -179,18 +177,20 @@ appear in the alignment data.
179177
```{r inferred, echo = FALSE, message = FALSE, warning = FALSE, out.width = '100%'}
180178
hs <- hist(
181179
round(valids$length_inferred / 1000,0),
182-
breaks = seq(min(round(valids$length_inferred/1000 - 5, 0)), round(max(valids$length_inferred/1000 + 5, 0)), by = 5),
180+
breaks = 25,
183181
plot = F
184182
)
185-
df <- data.frame(var = hs$mids, freq = round(hs$counts / sum(hs$counts)*100, 2))
186-
hchart(df, type = "spline", hcaes(x = var, y = freq), color = "#b3519d", name = "% of molecules") |>
183+
hs$counts <- round(hs$counts / sum(hs$counts)*100,2)
184+
hs <- data.frame(val = hs$breaks[-1], freq = hs$counts)
185+
186+
hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#b3519d", name = "% of molecules") |>
187187
hc_title(text = "Inferred Molecule Length") |>
188188
hc_subtitle(text = "lengths reported as kilobases (kbp)") |>
189189
hc_xAxis(title = list(text = "Inferred Molecule length (kbp)"), type = "logarithmic") |>
190190
hc_yAxis(title = list(text = "% of molecules")) |>
191191
hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |>
192192
hc_tooltip(crosshairs = TRUE) |>
193-
hc_exporting(enabled = T, filename = paste0(samplename, ".mollen")
193+
hc_exporting(enabled = T, filename = paste0(samplename, ".mollen"))
194194
```
195195

196196
### inferred_covplot {.no-title}
@@ -200,22 +200,22 @@ hs <- hist(
200200
breaks = seq(0, round(max(valids$percent_coverage + 1, 0)), by = 1),
201201
plot = F
202202
)
203-
df <- data.frame(var = hs$mids, freq = round(hs$counts / sum(hs$counts)*100,2))
203+
hs$counts <- round(hs$counts / sum(hs$counts)*100,2)
204+
hs <- data.frame(val = hs$breaks[-1], freq = hs$counts)
204205
205-
hchart(df, type = "spline", hcaes(x = var, y = freq), color = "#e59765", name = "% of molecules") |>
206+
hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#e59765", name = "% of molecules") |>
206207
hc_title(text = "Percent Molecule Coverage") |>
207208
hc_xAxis(title = list(text = "% molecule covered")) |>
208209
hc_yAxis(title = list(text = "% of molecules")) |>
209210
hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |>
210211
hc_tooltip(crosshairs = TRUE) |>
211-
hc_exporting(enabled = T, filename = paste0(samplename, ".molcov")
212+
hc_exporting(enabled = T, filename = paste0(samplename, ".molcov"))
212213
```
213214

214-
## Interpreting the supporting data
215+
## Interpreting the supporting data {.data-height=50}
215216
### interp desc {.no-title}
216217
<h2> Interpreting the Data </h2>
217-
Below are details on how to interpret the information presented in this report, as well as the
218-
underlying data used to create this report.
218+
These descriptions should help you understand the underlying data.
219219

220220
## inttable
221221
### interpreting {.no-title}
@@ -230,7 +230,7 @@ and that the sequences aligned to the same contig.
230230
```{r cols_explained, echo=FALSE, message=FALSE, warnings=FALSE}
231231
knitr::kable(
232232
data.frame(
233-
"Column Name" = c("contig", "molecule", "reads", "start", "end", "length_inferred", "percent_coverage", "aligned_bp", "mindist"),
233+
"Column Name" = c("contig", "molecule", "reads", "start", "end", "length_inferred", "percent_coverage", "aligned_bp"),
234234
"Description" = c(
235235
"name of the contig the molecule occurs on",
236236
"the molecule name as given by the MI:i: tag",
@@ -239,43 +239,28 @@ knitr::kable(
239239
"the end position of the last alignment for that molecule",
240240
"inferred length of the molecule based on the start/end of the alignments sharing the same barcode",
241241
"what percent of the molecule is represented by sequence alignments",
242-
"total number of base pairs aligned for that molecule",
243-
"the minimum basepair distance between two alignments sharing a barcode (excluding read pairs, kind of a sanity check)"
242+
"total number of base pairs aligned for that molecule"
244243
)
245244
)
246245
)
247246
```
248247

249-
## Interpreting this report
250248
### Barcode validity {.no-title}
251249
<h3> Interpreting Barcode Validity </h3>
252-
BX barcode validity is classified into one of three categories:
250+
BX barcode validity is classified into one of two categories:
253251

254252
```{r bx_explanation, echo=FALSE, message=FALSE, warnings=FALSE}
255253
knitr::kable(
256254
data.frame(
257-
"Classification" = c("valid BX", "invalid BX", "no BX"),
255+
"Classification" = c("valid BX", "invalid BX"),
258256
"Description" = c(
259257
"a complete BX barcode was present in the read (i.e. no 00 for any segments)",
260-
"a barcode was present in the read, but it contained 00 in at least one of the barcode segments",
261-
"no barcode was present in the read"
258+
"a barcode was present in the read, but it contained 00 in at least one of the barcode segments"
262259
)
263260
)
264261
)
265262
```
266263

267-
### Molecule splitting {.no-title}
268-
<h3> Molecule Splitting, Explained </h3>
269-
270-
It's common for a barcode shared by reads not originating from the same molecule
271-
to reappear much further along a chromosome or across multiple chromosomes. The
272-
process that derives the data in this report separates those recurring barcodes
273-
as unique molecules when their distance is greater than a predetermined threshold.
274-
If aligned with `BWA` or `minimap2`, Harpy added a corresponding `MI:i:` (Molecular
275-
Identifier) tag that reflects splits given the molecule distance threshold you
276-
specified (`r mdist`). If aligned with `EMA`, the `EMA` software itself
277-
determines the splits and assigns the `MI:i` tag without user specification.
278-
279264

280265
# Coverage Stats
281266

@@ -292,8 +277,7 @@ q99 <- quantile(tb$depth, 0.99)
292277
```{r echo = FALSE, message = FALSE, warning = FALSE}
293278
global_avg <- mean(tb$depth)
294279
global_sd <- sd(tb$depth)
295-
zscores <- (tb$depth - global_avg) / global_sd
296-
tb$outlier <- zscores >= 3
280+
tb$outlier <- tb$depth > q99
297281
outliers <- tb[tb$outlier, -5]
298282
nonoutliers <- tb[!(tb$outlier), -5]
299283
contig_avg <- group_by(tb, contig) %>%
@@ -316,10 +300,10 @@ contig_avg_filt <- rbind(
316300
<h1> Alignment Coverage Statistics </h1>
317301

318302
This report contains information regarding the sequence alignment coverage
319-
and depth for the file **`r paste0(samplename, ".bam")`**. The term "filtered" here and
320-
elsewhere in this report refers to removing intervals whose depth is greater
321-
than 3 standard deviations above the mean depth. The filtering described is shown
322-
for diagnostic purposes and no filtering has been performed on the original alignment file.
303+
and depth for the file **`r paste0(samplename, ".bam")`**. The term `<Q99` here and
304+
elsewhere in this report refers to keeping intervals whose depth is below the 99th
305+
depth percentile (`r q99`). The Q99 described is shown for diagnostic purposes, the
306+
alignments above this depth were not removed from the file.
323307

324308
## General Information {data-height=100}
325309
### ncontigs
@@ -329,7 +313,7 @@ valueBox(scales::comma(length(unique(tb$contig))), caption = "Contigs", color =
329313

330314
### general-samples
331315
```{r}
332-
valueBox("10kbp", caption = "Intervals", color = "info")
316+
valueBox("50 kbp", caption = "Intervals", color = "info")
333317
```
334318
### glob-avg
335319
```{r}
@@ -343,17 +327,17 @@ valueBox(scales::comma(global_sd), caption = "Stdev depth", color = "info")
343327

344328
### filt-avg
345329
```{r}
346-
valueBox(scales::comma(global_avg_filt), caption = "Average depth (filtered)", color = "info")
330+
valueBox(scales::comma(global_avg_filt), caption = '↓Q99 avg', color = "info")
347331
```
348332

349333
### filt-sd
350334
```{r}
351-
valueBox(scales::comma(global_sd_filt), caption = "Stdev depth (filtered)", color = "info")
335+
valueBox(scales::comma(global_sd_filt), caption = '↓Q99 stdev', color = "info")
352336
```
353337

354338
### n-outliers
355339
```{r}
356-
valueBox(scales::comma(nrow(outliers)), caption = "Possible outlier regions", color = "warning")
340+
valueBox(scales::comma(nrow(outliers)), caption = "regions >Q99", color = "warning")
357341
```
358342

359343
## Distdesc header
@@ -366,34 +350,39 @@ values, which is **`r q99`** for these data.
366350
## distributionplot
367351
### distplot {.no-title}
368352
```{r echo=F, warning=F, message=F}
369-
hchart(density(tb$depth[tb$depth <= q99]), color = "#9393d2", name = "density", inactiveOtherPoints = TRUE) |>
353+
hs <- hist(tb$depth[tb$depth <= q99], breaks = 0:(q99+1), plot = F)
354+
hs$counts <- round(hs$counts / sum(hs$counts)*100,2)
355+
hs <- data.frame(val = hs$breaks[-1], freq = hs$counts)
356+
hs <- hs[hs$val %% 2 == 0, ]
357+
358+
hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#9393d2", name = "% of molecules") |>
370359
hc_xAxis(ceiling = q99, title = list(text = "depth")) |>
371-
hc_yAxis(title = list(text = "density")) |>
360+
hc_yAxis(title = list(text = "% molecules")) |>
372361
hc_title(text = "Distribution of Alignment Depths") |>
373-
hc_exporting(enabled = T, filename = paste0(samplename, ".cov")
362+
hc_exporting(enabled = T, filename = paste0(samplename, ".cov"))
374363
```
375364

376365
## Sumheader
377366
### sumhead {.no-title}
378367
<h2> Coverage Summary Information </h2>
379-
Below are tables that summarize coverage information for 10kbp intervals.
368+
These tables will help you understand the sequence coverage of the sample.
380369

381370
## Tableheaders
382371
### Sumdesc {.no-title}
383372
The table below shows the global and per-contig average depth and standard
384-
deviation per 10kbp intervals **including** intervals whose depth is flagged
373+
deviation per 50kbp intervals **including** intervals whose depth is flagged
385374
an outlier in the data.
386375

387376

388377
### filtdesc {.no-title}
389378
The table below shows the global and per-contig average depth and standard
390-
deviation per 10kbp intervals, **excluding** intervals whose depth is flagged
379+
deviation per 50kbp intervals, **excluding** intervals whose depth is flagged
391380
an outlier in the data, as determined by being greater than 3 standard deviations
392381
above the mean depth. This should be a more accurate representation of read coverage.
393382

394383
### outlierdesc {.no-title}
395-
The table below shows the 10kbp intervals considered outliers, as determined by
396-
being greater than 3 standard deviations above the mean depth.
384+
The table below shows the 50kbp intervals considered outliers, as determined by
385+
having coverage greater than the 99th percentile (`r q99`).
397386

398387
## Summary information
399388
### Averages
@@ -405,7 +394,8 @@ DT::datatable(
405394
options = list(dom = 'Brtip', buttons = c('csv'), scrollX = TRUE),
406395
colnames = c('Contig', 'Average Depth', 'Standard Deviation'),
407396
autoHideNavigation = T,
408-
fillContainer = T
397+
fillContainer = T,
398+
height = "fit-content"
409399
)
410400
```
411401

@@ -418,7 +408,8 @@ DT::datatable(
418408
options = list(dom = 'Brtip', buttons = c('csv'), scrollX = TRUE),
419409
colnames = c('Contig', 'Average Depth', 'Standard Deviation'),
420410
autoHideNavigation = T,
421-
fillContainer = T
411+
fillContainer = T,
412+
height = "fit-content"
422413
)
423414
```
424415

@@ -439,21 +430,19 @@ DT::datatable(
439430
## Plotdesc {.no-title}
440431
### pltdsc {.no-title}
441432
<h2> Depth and Coverage Across the Genome </h2>
442-
Below are plots of the depth and coverage of alignments for this sample. Clicking
443-
on a plot will expand it to fill your browser window. Clicking it again will exit
444-
out of the zoomed view.
445-
446-
447-
## Alignment Desc
448-
### aligndesc {.no-title}
449433
Below is a circular plot summarizing the depth information across up to 30 of the largest contigs.
450434
For clarity, this visualization truncates coverage at the 99th percentile (`r q99`).
451435
Each bar represents the alignment depth at a 50kb genomic interval, that is, the
452436
number of reads that had a _proper_ alignment in the 50kb interval. "Proper" refers to a read
453437
not marked as a duplicate or flagged with the SAM `UNMAP`, `SECONDARY`, or `QCFAIL` flags.
454438
These values are derived by using `samtools bedcov -c`.
455439

456-
## Alignment Summary
440+
This plot allows yout to hover regions to view their coverage, pan by clicking and dragging,
441+
and zoom. In case you become unable to scroll up from the plot due to these interactive
442+
features, place your cursor over the navigation bar at the top of this report and you will
443+
be able to scroll the report instead of zooming on the plot.
444+
445+
## Alignment Summary {data-height=900}
457446
### Summary {.no-title}
458447
```{r echo = FALSE, message = FALSE, warning = FALSE}
459448
# Find the 30 largest contigs
@@ -470,7 +459,7 @@ if (nrow(contigs) > 30){
470459
}
471460
```
472461

473-
```{r fig.align='center', out.width= "100%"}
462+
```{r fig.align='center', out.width= "80%", out.height="900px"}
474463
genomeChr <- .contigs$size
475464
names(genomeChr) <- .contigs$contig
476465
genomeChr <- as.list(genomeChr)
@@ -504,8 +493,9 @@ BioCircos(
504493
genome = genomeChr,
505494
chrPad = 0.02,
506495
genomeTicksDisplay = F,
496+
BARMouseOverColor = "#1cff42",
507497
BARMouseOverTooltipsHtml05 = "Depth: ",
508-
genomeLabelDy = 0
498+
genomeLabelDy = 0,
499+
width = "100%"
509500
)
510-
511501
```

0 commit comments

Comments
 (0)