1
1
---
2
- title : " Haplotag Molecule Report"
2
+ title : " Harpy Alignment Report"
3
3
date : " `r format(Sys.time(), '%m-%d-%y %X')`"
4
4
output :
5
5
flexdashboard::flex_dashboard :
@@ -95,12 +95,12 @@ valueBox(scales::comma(totuniqBX), caption = "Total unique molecules", color = "
95
95
```
96
96
97
97
## N50 and N90
98
- ### Molecule Length Metrics
99
- ``` {r echo = FALSE, message = FALSE, warning = FALSE, out.width = '70%' }
98
+ ### Molecule NXX Length Metrics
99
+ ``` {r echo = FALSE, message = FALSE, warning = FALSE}
100
100
valids %>%
101
101
group_by(contig) %>%
102
- summarize(n50 = NX(length_inferred, 50), n75 = NX(length_inferred, 75), n90 = NX(length_inferred, 90)) %>%
103
- as.data.frame() %>% knitr::kable( )
102
+ summarize(n50 = NX(length_inferred, 50), n75 = NX(length_inferred, 75), n90 = NX(length_inferred, 90)) %>%
103
+ DT::datatable(rownames = F, options = list(dom = 'Brtip', buttons = c('csv')), fillContainer = T )
104
104
```
105
105
106
106
## Reads per molecule dec
@@ -125,34 +125,32 @@ hs <- hist(
125
125
breaks = min(valids$reads):max(valids$reads),
126
126
plot = F
127
127
)
128
- df <- data.frame(var = hs$mids, freq = round(hs$counts / sum(hs$counts) *100, 2))
128
+ hs$counts <- round(hs$counts / sum(hs$counts) * 100, 2)
129
+ hs <- data.frame(val = hs$breaks[-1], freq = hs$counts)
129
130
130
- hchart(density(valids$reads ), color = "#8484bd", name = "density ") |>
131
+ hchart(hs, "areaspline", hcaes(x = val, y = freq ), color = "#8484bd", name = "% of molecules ") |>
131
132
hc_title(text = "Reads Per Molecule") |>
132
133
hc_xAxis(title = list(text = "reads per molecule")) |>
133
- hc_yAxis(title = list(text = "density ")) |>
134
+ hc_yAxis(title = list(text = "% molecules ")) |>
134
135
hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |>
135
136
hc_tooltip(crosshairs = TRUE) |>
136
- hc_exporting(enabled = T, filename = paste0(samplename, ".readsper")
137
+ hc_exporting(enabled = T, filename = paste0(samplename, ".readsper"))
137
138
138
139
```
139
140
140
141
### bases per {.no-title}
141
142
``` {r basesper, echo = FALSE, message = FALSE, warning = FALSE, out.width="100%"}
142
- #hs <- hist(
143
- # round(valids$aligned_bp, -2),
144
- # breaks = seq(max(1, valids$aligned_bp), round(max(valids$aligned_bp + 150, -2)), by = 200),
145
- # plot = F
146
- #)
147
- #df <- data.frame(var = hs$mids, freq = round(hs$counts / sum(hs$counts)*100,2))
148
-
149
- hchart(density(valids$aligned_bp), color = "#75b89e", name = "density") |>
143
+ hs <- hist(round(valids$aligned_bp, -2), breaks = 50, plot = F)
144
+ hs$counts <- round(hs$counts / sum(hs$counts)*100,2)
145
+ hs <- data.frame(val = hs$breaks[-1], freq = hs$counts)
146
+
147
+ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#75b89e", name = "% of molecules") |>
150
148
hc_title(text = "Bases Aligned Per Molecule") |>
151
149
hc_xAxis(title = list(text = "aligned bases per molecule")) |>
152
- hc_yAxis(title = list(text = "density ")) |>
150
+ hc_yAxis(title = list(text = "% molecules ")) |>
153
151
hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |>
154
152
hc_tooltip(crosshairs = TRUE) |>
155
- hc_exporting(enabled = T, filename = paste0(samplename, ".basesper")
153
+ hc_exporting(enabled = T, filename = paste0(samplename, ".basesper"))
156
154
```
157
155
158
156
@@ -179,18 +177,20 @@ appear in the alignment data.
179
177
``` {r inferred, echo = FALSE, message = FALSE, warning = FALSE, out.width = '100%'}
180
178
hs <- hist(
181
179
round(valids$length_inferred / 1000,0),
182
- breaks = seq(min(round(valids$length_inferred/1000 - 5, 0)), round(max(valids$length_inferred/1000 + 5, 0)), by = 5) ,
180
+ breaks = 25 ,
183
181
plot = F
184
182
)
185
- df <- data.frame(var = hs$mids, freq = round(hs$counts / sum(hs$counts)*100, 2))
186
- hchart(df, type = "spline", hcaes(x = var, y = freq), color = "#b3519d", name = "% of molecules") |>
183
+ hs$counts <- round(hs$counts / sum(hs$counts)*100,2)
184
+ hs <- data.frame(val = hs$breaks[-1], freq = hs$counts)
185
+
186
+ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#b3519d", name = "% of molecules") |>
187
187
hc_title(text = "Inferred Molecule Length") |>
188
188
hc_subtitle(text = "lengths reported as kilobases (kbp)") |>
189
189
hc_xAxis(title = list(text = "Inferred Molecule length (kbp)"), type = "logarithmic") |>
190
190
hc_yAxis(title = list(text = "% of molecules")) |>
191
191
hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |>
192
192
hc_tooltip(crosshairs = TRUE) |>
193
- hc_exporting(enabled = T, filename = paste0(samplename, ".mollen")
193
+ hc_exporting(enabled = T, filename = paste0(samplename, ".mollen"))
194
194
```
195
195
196
196
### inferred_covplot {.no-title}
@@ -200,22 +200,22 @@ hs <- hist(
200
200
breaks = seq(0, round(max(valids$percent_coverage + 1, 0)), by = 1),
201
201
plot = F
202
202
)
203
- df <- data.frame(var = hs$mids, freq = round(hs$counts / sum(hs$counts)*100,2))
203
+ hs$counts <- round(hs$counts / sum(hs$counts)*100,2)
204
+ hs <- data.frame(val = hs$breaks[-1], freq = hs$counts)
204
205
205
- hchart(df, type = "spline ", hcaes(x = var , y = freq), color = "#e59765", name = "% of molecules") |>
206
+ hchart(hs, "areaspline ", hcaes(x = val , y = freq), color = "#e59765", name = "% of molecules") |>
206
207
hc_title(text = "Percent Molecule Coverage") |>
207
208
hc_xAxis(title = list(text = "% molecule covered")) |>
208
209
hc_yAxis(title = list(text = "% of molecules")) |>
209
210
hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |>
210
211
hc_tooltip(crosshairs = TRUE) |>
211
- hc_exporting(enabled = T, filename = paste0(samplename, ".molcov")
212
+ hc_exporting(enabled = T, filename = paste0(samplename, ".molcov"))
212
213
```
213
214
214
- ## Interpreting the supporting data
215
+ ## Interpreting the supporting data {.data-height=50}
215
216
### interp desc {.no-title}
216
217
<h2 > Interpreting the Data </h2 >
217
- Below are details on how to interpret the information presented in this report, as well as the
218
- underlying data used to create this report.
218
+ These descriptions should help you understand the underlying data.
219
219
220
220
## inttable
221
221
### interpreting {.no-title}
@@ -230,7 +230,7 @@ and that the sequences aligned to the same contig.
230
230
``` {r cols_explained, echo=FALSE, message=FALSE, warnings=FALSE}
231
231
knitr::kable(
232
232
data.frame(
233
- "Column Name" = c("contig", "molecule", "reads", "start", "end", "length_inferred", "percent_coverage", "aligned_bp", "mindist" ),
233
+ "Column Name" = c("contig", "molecule", "reads", "start", "end", "length_inferred", "percent_coverage", "aligned_bp"),
234
234
"Description" = c(
235
235
"name of the contig the molecule occurs on",
236
236
"the molecule name as given by the MI:i: tag",
@@ -239,43 +239,28 @@ knitr::kable(
239
239
"the end position of the last alignment for that molecule",
240
240
"inferred length of the molecule based on the start/end of the alignments sharing the same barcode",
241
241
"what percent of the molecule is represented by sequence alignments",
242
- "total number of base pairs aligned for that molecule",
243
- "the minimum basepair distance between two alignments sharing a barcode (excluding read pairs, kind of a sanity check)"
242
+ "total number of base pairs aligned for that molecule"
244
243
)
245
244
)
246
245
)
247
246
```
248
247
249
- ## Interpreting this report
250
248
### Barcode validity {.no-title}
251
249
<h3 > Interpreting Barcode Validity </h3 >
252
- BX barcode validity is classified into one of three categories:
250
+ BX barcode validity is classified into one of two categories:
253
251
254
252
``` {r bx_explanation, echo=FALSE, message=FALSE, warnings=FALSE}
255
253
knitr::kable(
256
254
data.frame(
257
- "Classification" = c("valid BX", "invalid BX", "no BX" ),
255
+ "Classification" = c("valid BX", "invalid BX"),
258
256
"Description" = c(
259
257
"a complete BX barcode was present in the read (i.e. no 00 for any segments)",
260
- "a barcode was present in the read, but it contained 00 in at least one of the barcode segments",
261
- "no barcode was present in the read"
258
+ "a barcode was present in the read, but it contained 00 in at least one of the barcode segments"
262
259
)
263
260
)
264
261
)
265
262
```
266
263
267
- ### Molecule splitting {.no-title}
268
- <h3 > Molecule Splitting, Explained </h3 >
269
-
270
- It's common for a barcode shared by reads not originating from the same molecule
271
- to reappear much further along a chromosome or across multiple chromosomes. The
272
- process that derives the data in this report separates those recurring barcodes
273
- as unique molecules when their distance is greater than a predetermined threshold.
274
- If aligned with ` BWA ` or ` minimap2 ` , Harpy added a corresponding ` MI:i: ` (Molecular
275
- Identifier) tag that reflects splits given the molecule distance threshold you
276
- specified (` r mdist ` ). If aligned with ` EMA ` , the ` EMA ` software itself
277
- determines the splits and assigns the ` MI:i ` tag without user specification.
278
-
279
264
280
265
# Coverage Stats
281
266
@@ -292,8 +277,7 @@ q99 <- quantile(tb$depth, 0.99)
292
277
``` {r echo = FALSE, message = FALSE, warning = FALSE}
293
278
global_avg <- mean(tb$depth)
294
279
global_sd <- sd(tb$depth)
295
- zscores <- (tb$depth - global_avg) / global_sd
296
- tb$outlier <- zscores >= 3
280
+ tb$outlier <- tb$depth > q99
297
281
outliers <- tb[tb$outlier, -5]
298
282
nonoutliers <- tb[!(tb$outlier), -5]
299
283
contig_avg <- group_by(tb, contig) %>%
@@ -316,10 +300,10 @@ contig_avg_filt <- rbind(
316
300
<h1 > Alignment Coverage Statistics </h1 >
317
301
318
302
This report contains information regarding the sequence alignment coverage
319
- and depth for the file ** ` r paste0(samplename, ".bam") ` ** . The term "filtered" here and
320
- elsewhere in this report refers to removing intervals whose depth is greater
321
- than 3 standard deviations above the mean depth . The filtering described is shown
322
- for diagnostic purposes and no filtering has been performed on the original alignment file.
303
+ and depth for the file ** ` r paste0(samplename, ".bam") ` ** . The term ` <Q99 ` here and
304
+ elsewhere in this report refers to keeping intervals whose depth is below the 99th
305
+ depth percentile ( ` r q99 ` ) . The Q99 described is shown for diagnostic purposes, the
306
+ alignments above this depth were not removed from the file.
323
307
324
308
## General Information {data-height=100}
325
309
### ncontigs
@@ -329,7 +313,7 @@ valueBox(scales::comma(length(unique(tb$contig))), caption = "Contigs", color =
329
313
330
314
### general-samples
331
315
``` {r}
332
- valueBox("10kbp ", caption = "Intervals", color = "info")
316
+ valueBox("50 kbp ", caption = "Intervals", color = "info")
333
317
```
334
318
### glob-avg
335
319
``` {r}
@@ -343,17 +327,17 @@ valueBox(scales::comma(global_sd), caption = "Stdev depth", color = "info")
343
327
344
328
### filt-avg
345
329
``` {r}
346
- valueBox(scales::comma(global_avg_filt), caption = "Average depth (filtered)" , color = "info")
330
+ valueBox(scales::comma(global_avg_filt), caption = '↓Q99 avg' , color = "info")
347
331
```
348
332
349
333
### filt-sd
350
334
``` {r}
351
- valueBox(scales::comma(global_sd_filt), caption = "Stdev depth (filtered)" , color = "info")
335
+ valueBox(scales::comma(global_sd_filt), caption = '↓Q99 stdev' , color = "info")
352
336
```
353
337
354
338
### n-outliers
355
339
``` {r}
356
- valueBox(scales::comma(nrow(outliers)), caption = "Possible outlier regions", color = "warning")
340
+ valueBox(scales::comma(nrow(outliers)), caption = "regions >Q99 ", color = "warning")
357
341
```
358
342
359
343
## Distdesc header
@@ -366,34 +350,39 @@ values, which is **`r q99`** for these data.
366
350
## distributionplot
367
351
### distplot {.no-title}
368
352
``` {r echo=F, warning=F, message=F}
369
- hchart(density(tb$depth[tb$depth <= q99]), color = "#9393d2", name = "density", inactiveOtherPoints = TRUE) |>
353
+ hs <- hist(tb$depth[tb$depth <= q99], breaks = 0:(q99+1), plot = F)
354
+ hs$counts <- round(hs$counts / sum(hs$counts)*100,2)
355
+ hs <- data.frame(val = hs$breaks[-1], freq = hs$counts)
356
+ hs <- hs[hs$val %% 2 == 0, ]
357
+
358
+ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#9393d2", name = "% of molecules") |>
370
359
hc_xAxis(ceiling = q99, title = list(text = "depth")) |>
371
- hc_yAxis(title = list(text = "density ")) |>
360
+ hc_yAxis(title = list(text = "% molecules ")) |>
372
361
hc_title(text = "Distribution of Alignment Depths") |>
373
- hc_exporting(enabled = T, filename = paste0(samplename, ".cov")
362
+ hc_exporting(enabled = T, filename = paste0(samplename, ".cov"))
374
363
```
375
364
376
365
## Sumheader
377
366
### sumhead {.no-title}
378
367
<h2 > Coverage Summary Information </h2 >
379
- Below are tables that summarize coverage information for 10kbp intervals .
368
+ These tables will help you understand the sequence coverage of the sample .
380
369
381
370
## Tableheaders
382
371
### Sumdesc {.no-title}
383
372
The table below shows the global and per-contig average depth and standard
384
- deviation per 10kbp intervals ** including** intervals whose depth is flagged
373
+ deviation per 50kbp intervals ** including** intervals whose depth is flagged
385
374
an outlier in the data.
386
375
387
376
388
377
### filtdesc {.no-title}
389
378
The table below shows the global and per-contig average depth and standard
390
- deviation per 10kbp intervals, ** excluding** intervals whose depth is flagged
379
+ deviation per 50kbp intervals, ** excluding** intervals whose depth is flagged
391
380
an outlier in the data, as determined by being greater than 3 standard deviations
392
381
above the mean depth. This should be a more accurate representation of read coverage.
393
382
394
383
### outlierdesc {.no-title}
395
- The table below shows the 10kbp intervals considered outliers, as determined by
396
- being greater than 3 standard deviations above the mean depth .
384
+ The table below shows the 50kbp intervals considered outliers, as determined by
385
+ having coverage greater than the 99th percentile ( ` r q99 ` ) .
397
386
398
387
## Summary information
399
388
### Averages
@@ -405,7 +394,8 @@ DT::datatable(
405
394
options = list(dom = 'Brtip', buttons = c('csv'), scrollX = TRUE),
406
395
colnames = c('Contig', 'Average Depth', 'Standard Deviation'),
407
396
autoHideNavigation = T,
408
- fillContainer = T
397
+ fillContainer = T,
398
+ height = "fit-content"
409
399
)
410
400
```
411
401
@@ -418,7 +408,8 @@ DT::datatable(
418
408
options = list(dom = 'Brtip', buttons = c('csv'), scrollX = TRUE),
419
409
colnames = c('Contig', 'Average Depth', 'Standard Deviation'),
420
410
autoHideNavigation = T,
421
- fillContainer = T
411
+ fillContainer = T,
412
+ height = "fit-content"
422
413
)
423
414
```
424
415
@@ -439,21 +430,19 @@ DT::datatable(
439
430
## Plotdesc {.no-title}
440
431
### pltdsc {.no-title}
441
432
<h2 > Depth and Coverage Across the Genome </h2 >
442
- Below are plots of the depth and coverage of alignments for this sample. Clicking
443
- on a plot will expand it to fill your browser window. Clicking it again will exit
444
- out of the zoomed view.
445
-
446
-
447
- ## Alignment Desc
448
- ### aligndesc {.no-title}
449
433
Below is a circular plot summarizing the depth information across up to 30 of the largest contigs.
450
434
For clarity, this visualization truncates coverage at the 99th percentile (` r q99 ` ).
451
435
Each bar represents the alignment depth at a 50kb genomic interval, that is, the
452
436
number of reads that had a _ proper_ alignment in the 50kb interval. "Proper" refers to a read
453
437
not marked as a duplicate or flagged with the SAM ` UNMAP ` , ` SECONDARY ` , or ` QCFAIL ` flags.
454
438
These values are derived by using ` samtools bedcov -c ` .
455
439
456
- ## Alignment Summary
440
+ This plot allows yout to hover regions to view their coverage, pan by clicking and dragging,
441
+ and zoom. In case you become unable to scroll up from the plot due to these interactive
442
+ features, place your cursor over the navigation bar at the top of this report and you will
443
+ be able to scroll the report instead of zooming on the plot.
444
+
445
+ ## Alignment Summary {data-height=900}
457
446
### Summary {.no-title}
458
447
``` {r echo = FALSE, message = FALSE, warning = FALSE}
459
448
# Find the 30 largest contigs
@@ -470,7 +459,7 @@ if (nrow(contigs) > 30){
470
459
}
471
460
```
472
461
473
- ``` {r fig.align='center', out.width= "100% "}
462
+ ``` {r fig.align='center', out.width= "80%", out.height="900px "}
474
463
genomeChr <- .contigs$size
475
464
names(genomeChr) <- .contigs$contig
476
465
genomeChr <- as.list(genomeChr)
@@ -504,8 +493,9 @@ BioCircos(
504
493
genome = genomeChr,
505
494
chrPad = 0.02,
506
495
genomeTicksDisplay = F,
496
+ BARMouseOverColor = "#1cff42",
507
497
BARMouseOverTooltipsHtml05 = "Depth: ",
508
- genomeLabelDy = 0
498
+ genomeLabelDy = 0,
499
+ width = "100%"
509
500
)
510
-
511
501
```
0 commit comments