Skip to content

Commit e1867c2

Browse files
authored
Improved singleton counting (#170)
* fix name * keep stderr open for common convention * add useful script * update how singletons are counted * fix the last missing bc * alphanumeric check
1 parent 2b29a56 commit e1867c2

File tree

7 files changed

+98
-12
lines changed

7 files changed

+98
-12
lines changed

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -831,7 +831,7 @@ jobs:
831831
- name: harpy downsample bam
832832
shell: micromamba-shell {0}
833833
run: harpy downsample -d 1 --random-seed 699 --quiet test/bam/sample1.bam
834-
- name: harpy downsample bam
834+
- name: harpy downsample fastq
835835
shell: micromamba-shell {0}
836836
run: harpy downsample -d 1 --quiet test/fastq/sample1.*
837837
- name: harpy hpc

harpy/bin/bx_stats.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,8 +141,13 @@ def writestats(x, writechrom, destination):
141141
}
142142
else:
143143
# update the basic alignment info of the molecule
144+
if read.is_forward:
145+
# +1 for a forward read, whether it is paired or not
146+
d[mi]["n"] += 1
147+
elif read.is_reverse and not read.is_paired:
148+
# +1 for reverse only if it's unpaired, so the paired read doesn't count twice
149+
d[mi]["n"] += 1
144150
d[mi]["bp"] += bp
145-
d[mi]["n"] += 1
146151
d[mi]["insert_len"] += isize
147152
d[mi]["start"] = min(pos_start, d[mi]["start"])
148153
d[mi]["end"] = max(pos_end, d[mi]["end"])

harpy/bin/separate_singletons

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#! /usr/bin/env python
"""Isolate singleton and non-singleton linked-read BAM records into separate files.

The input BAM is first sorted by the barcode tag with ``samtools sort -t`` so
that all records sharing a barcode are adjacent. Records are then streamed
group by group: a barcode with more than one counted read is written to
stdout (non-singletons), every other barcode is written to the singletons
file. Records that lack the barcode tag, or whose barcode matches the
invalid-barcode pattern, are written to neither output.
"""

import os
import re
import sys
import argparse
import subprocess

# string barcodes matching this pattern are treated as invalid and skipped
INVALID_BARCODE = re.compile(r'[AaBbCcDd]00')


def is_valid_tag(tag):
    """Return True if `tag` is alphanumeric and exactly two characters long."""
    return len(tag) == 2 and tag.isalnum()


def is_invalid_barcode(barcode):
    """Return True if `barcode` is a string matching the invalid-barcode pattern.

    Integer barcodes (e.g. from an MI-type tag) are never considered invalid.
    """
    if isinstance(barcode, int):
        return False
    return INVALID_BARCODE.search(barcode) is not None


def counts_as_read(record):
    """Return True if `record` should add 1 to its barcode's read count.

    A forward read always counts, whether it is paired or not. A reverse read
    counts only when it is unpaired, so that a read pair is not counted twice.
    """
    return record.is_forward or (record.is_reverse and not record.is_paired)


def bxsorted_name(path):
    """Return the name of the intermediate barcode-sorted BAM for `path`."""
    # splitext instead of a fixed [:-4] slice so any extension length works
    root, _ = os.path.splitext(path)
    return f"{root}.bxsort.bam"


def write_group(records, read_count, nonsingleton, singleton):
    """Write one barcode's records to the output matching its read count."""
    destination = nonsingleton if read_count > 1 else singleton
    for record in records:
        destination.write(record)


def split_by_barcode(records, bx_tag, nonsingleton, singleton):
    """Route barcode-sorted records into non-singleton and singleton outputs.

    Parameters:
        records: iterable of alignment records, grouped (sorted) by `bx_tag`.
        bx_tag: the two-character tag holding the barcode.
        nonsingleton: object with a `write(record)` method, receives the
            records of barcodes with more than one counted read.
        singleton: object with a `write(record)` method, receives the rest.

    Records without `bx_tag` or with an invalid barcode are skipped.
    """
    group = []
    read_count = 0
    last_barcode = None
    for record in records:
        try:
            barcode = record.get_tag(bx_tag)
        except KeyError:
            continue
        if is_invalid_barcode(barcode):
            continue
        # write the stored records when the barcode changes; compare against
        # None explicitly because an integer barcode of 0 is falsy
        if last_barcode is not None and barcode != last_barcode:
            write_group(group, read_count, nonsingleton, singleton)
            group = []
            read_count = 0
        group.append(record)
        if counts_as_read(record):
            read_count += 1
        last_barcode = barcode
    # flush whatever is left after the final record (no-op when empty)
    write_group(group, read_count, nonsingleton, singleton)


def build_parser():
    """Create the command-line argument parser."""
    parser = argparse.ArgumentParser(
        prog='separate_singletons',
        description='Isolate singleton and non-singleton linked-read BAM records into separate files.',
        usage = "separate_singletons -t threads -b barcode_tag -s singletons.bam input.bam > output.bam",
    )
    parser.add_argument("-b", dest = "bx_tag", metavar = "barcode_tag", type=str, default = "BX", help="The header tag with the barcode (default: %(default)s)")
    parser.add_argument("-s", dest = "singletons", metavar = "singletons_file", type=str, default = "singletons.bam", help="Name of output singleton file (default: %(default)s)")
    parser.add_argument("-t", dest = "threads", metavar="threads", type=int, default = 4, help="Number of threads to use (default: %(default)s)")
    parser.add_argument('input', type = str, help = "Input bam file")
    return parser


def main():
    """Parse arguments, sort the input by barcode, and split it into two BAMs."""
    # imported here so the helpers above stay importable without pysam
    import pysam

    parser = build_parser()
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    args = parser.parse_args()
    if args.threads < 1:
        parser.error(f"Threads supplied to -t ({args.threads}) must be positive (e.g. >=1)")
    if not os.path.exists(args.input):
        parser.error(f"{args.input} was not found")
    if not is_valid_tag(args.bx_tag):
        parser.error(f"The header tag supplied to -b ({args.bx_tag}) must be alphanumeric and exactly two characters long")

    sorted_bam = bxsorted_name(args.input)
    try:
        # pass an argument list (not a split string) so paths with spaces work,
        # and check=True so a failed sort is not silently ignored
        subprocess.run(
            ["samtools", "sort", "-@", str(args.threads), "-o", sorted_bam, "-t", args.bx_tag, args.input],
            stderr=sys.stderr,
            check=True,
        )
        with (
            pysam.AlignmentFile(sorted_bam, "rb", check_sq=False) as infile,
            pysam.AlignmentFile(sys.stdout, "wb", template=infile) as nonsingleton,
            pysam.AlignmentFile(args.singletons, "wb", template=infile) as singleton,
        ):
            split_by_barcode(infile, args.bx_tag, nonsingleton, singleton)
    except subprocess.CalledProcessError as err:
        sys.exit(f"samtools sort failed with exit code {err.returncode}")
    finally:
        # final housekeeping to remove intermediate, even after a failure
        if os.path.exists(sorted_bam):
            os.remove(sorted_bam)


if __name__ == "__main__":
    main()

harpy/bin/separate_validbx

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#! /usr/bin/env bash
22

33
if [[ -z "$1" ]]; then
4-
echo -e "\n Split a BAM file with BX:Z tags into 2 files, one with valid ACBD barcodes (stdout), one with invalid ACBD barcodes (stderr)."
5-
echo -e "\n [usage] separate_validbx input.bam > valid.bam 2> invalid.bam"
4+
echo -e "\n Split a BAM file with BX:Z tags into 2 files, one with valid ACBD barcodes (stdout), one with invalid ACBD barcodes."
5+
echo -e "\n [usage] separate_validbx invalid.bam input.bam > valid.bam"
66
exit
77
fi
88

9-
samtools view -e '[BX]!~"[ABCD]0{2,4}"' --unoutput /dev/stderr $1
9+
samtools view -e '[BX]!~"[ABCD]0{2,4}"' --unoutput $1 $2

harpy/downsample.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ def downsample(input, prefix, downsample, invalid, bx_tag, random_seed, threads,
4747
- `drop`: don't output any invalid/missing barcodes
4848
"""
4949
# validate input files as either 1 bam or 2 fastq
50-
if len(bx_tag) != 2:
51-
raise click.BadParameter(f'\'{bx_tag}\' is not a valid SAM tag. Tags for --bx-tag must be exactly 2 characters, e.g. "BX"')
50+
if len(bx_tag) != 2 or not bx_tag.isalnum():
51+
raise click.BadParameter(f'\'{bx_tag}\' is not a valid SAM tag. Tags for --bx-tag must be alphanumeric and exactly 2 characters, e.g. "BX"')
5252
if len(input) > 2:
5353
raise click.BadParameter('inputs must be 1 BAM file or 2 FASTQ files.')
5454
if len(input) == 1:

harpy/reports/align_bxstats.Rmd

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@ process_input <- function(infile){
5555
tb[tb$valid != "invalidBX", "valid"] <- "validBX"
5656
tb$valid <- gsub("BX", " BX", tb$valid)
5757
# isolate non-singletons b/c molecules with 1 read pair aren't linked reads
58-
multiplex_df <- filter(tb, valid == "valid BX", reads > 2)
59-
singletons <- sum(tb$reads <= 2 & tb$valid == "valid BX")
58+
multiplex_df <- filter(tb, valid == "valid BX", reads >= 2)
59+
singletons <- sum(tb$reads < 2 & tb$valid == "valid BX")
6060
tot_uniq_bx <- read.table(infile, header = F, sep = "\n", as.is = T, skip = nrow(tb) + 1, comment.char = "+")
6161
tot_uniq_bx <- gsub("#total unique barcodes: ", "", tot_uniq_bx$V1[1]) |> as.integer()
6262
tot_mol <- sum(tb$valid == "valid BX")
@@ -116,7 +116,8 @@ if(nrow(aggregate_df) == 0){
116116
This report aggregates the barcode-specific information from the alignments
117117
that were created using `harpy align`. Detailed information for any one sample
118118
can be found in that sample's individual report. The table below is an aggregation
119-
of data for each sample based on their `*.bxstats.gz` file.
119+
of data for each sample based on their `*.bxstats.gz` file. Every column after `% valid bx`
120+
ignores singletons in its calculations.
120121

121122
- `avg` refers to the average (arithmetic mean)
122123
- `SEM` refers to the Standard Error of the mean

harpy/reports/align_stats.Rmd

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ totuniqBX <- gsub("#total unique barcodes: ", "", totuniqBX) |> as.integer()
7777
tot_valid <- sum(valids$reads)
7878
tot_invalid <- sum(invalids$reads)
7979
80-
non_singletons <- valids[valids$reads >2, ]
80+
non_singletons <- valids[valids$reads >= 2, ]
8181
n_non_singleton_mol <- nrow(non_singletons)
8282
```
8383

@@ -131,7 +131,7 @@ valueBox(scales::comma(tot_invalid), caption = "Invalid BX Records", color = "wa
131131
### singletons
132132
```{r valuebox_singletons}
133133
if (VALID_PRESENT){
134-
valueBox(round(sum(valids$reads <= 2)/nrow(valids), 2), caption = "% Singletons")
134+
valueBox(round(sum(valids$reads < 2)/nrow(valids), 2), caption = "% Singletons")
135135
} else {
136136
valueBox("NA", caption = "% Singletons")
137137
}

0 commit comments

Comments
 (0)