Skip to content

Commit

Permalink
Merge branch 'dev' into FelixKrueger-patch-1
Browse files Browse the repository at this point in the history
  • Loading branch information
FelixKrueger authored Aug 23, 2023
2 parents e16bb23 + 0edf8f0 commit 8c5fe8e
Show file tree
Hide file tree
Showing 5 changed files with 217 additions and 7 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@

- removed an `exit 0` that would terminate runs after processing a single (set of) input file(s).

### deduplicate_bismark

- Changed the path to Samtools to custom variable ([#609](https://github.com/FelixKrueger/Bismark/issues/609))


## Changelog for Bismark v0.24.1 (release on 29 May 2023)

- Added new [documentation website](http://felixkrueger.github.io/Bismark/), built using [Material for Mkdocs](https://squidfunk.github.io/mkdocs-material/). Thanks to @ewels for a great (late-night) effort to break up and restructure what had become a fairly unwieldy monolithic beast
Expand Down
12 changes: 6 additions & 6 deletions bismark
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use lib "$RealBin/../lib";

my $parent_dir = getcwd();

my $bismark_version = 'v0.24.1';
my $bismark_version = 'v0.24.1dev';
my $copyright_dates = "2010-23";

my $start_run = time();
Expand Down Expand Up @@ -921,7 +921,7 @@ foreach my $filename (@filenames){
warn "Nucleotide coverage statistics are currently only available for BAM or CRAM files\n\n";
}
}
exit 0;
# exit 0; # will terminate after a single supplied file....
}
else{
# If multiple files were supplied as the command line, like so:
Expand Down Expand Up @@ -7963,9 +7963,9 @@ VERSION

### BOWTIE 2/HISAT2 PARALLELIZATION OPTIONS
if ($parallel){
die "Please select a value for -p of 2 or more (ideally not higher than 4 though...)!\n" unless ($parallel > 1);
die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1);
if ($parallel > 4){
warn "Attention: using more than 4 cores per alignment thread has been reported to have diminishing returns. If possible try to limit -p to a value of 4\n"; sleep(2);
warn "Attention: early reports suggested that high values of -p to have diminishing returns. Please test different values using a small subset of data for your hardware setting.\n"; sleep(1);
}
push @aligner_options,"-p $parallel";
push @aligner_options,'--reorder'; ## re-orders the Bowtie 2 or HISAT2 output so that it does match the input files. This is abolutely required for parallelization to work.
Expand Down Expand Up @@ -9916,7 +9916,7 @@ Bowtie 2/ HISAT2 parallelization options:
and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
by a few hundred megabytes. This option is only available if Bowtie 2 is linked with the pthreads
library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
automatically use the option '--reorder', which guarantees that output SAM records are printed in
an order corresponding to the order of the reads in the original input file, even when -p is set
Expand Down Expand Up @@ -9994,6 +9994,6 @@ Bismark BAM/SAM OUTPUT (default):
Each read of paired-end alignments is written out in a separate line in the above format.
Last modified on 28 May 2023
Last modified on 23 August 2023
HOW_TO
}
2 changes: 1 addition & 1 deletion deduplicate_bismark
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ foreach my $file (@filenames){
if ($multiple){
# we are assuming that all files are either in BAM format
if ($file =~ /\.bam$/){
open (IN,"$samtools_path cat -h $file @filenames | samtools view -h |") or die "Unable to read from BAM files in '@filenames': $!\n";
open (IN,"$samtools_path cat -h $file @filenames | $samtools_path view -h |") or die "Unable to read from BAM files in '@filenames': $!\n";
}
# or all in SAM format
else{ # if the reads are in SAM format we simply concatenate them
Expand Down
78 changes: 78 additions & 0 deletions merge_arbitrary_coverage_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/usr/bin/env python

import os
import glob
import time
import gzip
import argparse
import sys

def merge_coverage_files(basename):
print(f"Merging Bismark coverage files into a file called '{basename}.cov.gz'")
cov_files = glob.glob("*.cov.gz")

if not cov_files:
print("Error: No files ending in '.cov.gz' found in the current folder.")
sys.exit(1)

allcov = {} # overarching dictionary

for file in cov_files:
print(f"Reading methylation calls from file: {file}")

isGzip = False
if file.endswith("gz"):
infile = gzip.open(file, 'rt') # mode is 'rt' for text mode
isGzip = True
else:
infile = open(file)

for line in infile:

if isGzip:
line = line.rstrip() # no need to decode if using 'rt' mode
else:
line = line.rstrip()

chrom, pos, m, u = [line.split(sep="\t")[i] for i in (0, 1, 4, 5)] # list comprehension

if chrom in allcov.keys():
pass
else:
allcov[chrom] = {}

pos = int(pos)

if pos in allcov[chrom].keys():
pass
else:
allcov[chrom][pos] = {}
allcov[chrom][pos]["meth"] = 0
allcov[chrom][pos]["unmeth"] = 0

allcov[chrom][pos]["meth"] += int(m)
allcov[chrom][pos]["unmeth"] += int(u)

infile.close()

print("Now printing out a new, merged coverage file")

with gzip.open(f"{basename}.cov.gz", "wt") as out:
for chrom in sorted(allcov.keys()):
for pos in sorted(allcov[chrom].keys()):
perc = ''
if (allcov[chrom][pos]['meth'] + allcov[chrom][pos]['unmeth'] == 0):
print("Both methylated and unmethylated positions were 0. Exiting...")
sys.exit()
else:
perc = allcov[chrom][pos]['meth'] / (allcov[chrom][pos]['meth'] + allcov[chrom][pos]['unmeth']) * 100

out.write(f"{chrom}\t{pos}\t{pos}\t{perc:.2f}\t{allcov[chrom][pos]['meth']}\t{allcov[chrom][pos]['unmeth']}\n")

print(f"All done! The merged coverage file '{basename}.cov.gz' has been created.\n")

if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Merge Bismark coverage files into a file called "basename.cov.gz".')
parser.add_argument('-b', '--basename', default='merged_coverage_file', help='The basename for the merged coverage file.')
args = parser.parse_args()
merge_coverage_files(args.basename)
127 changes: 127 additions & 0 deletions merge_coverage_files_ARGV.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#!/usr/bin/env python

import os
import glob
import time
import gzip
import sys
import argparse

print ("Merging Bismark coverage files given as command line arguments")
# cov_files = glob.glob("*.cov.gz")

# if (options.verbose):
# print_options(options)




def parse_options():
parser = argparse.ArgumentParser(description="Set base-name to write merged Bismark coverage files to")
parser.add_argument("-b","--basename", type=str, help="Basename of file to write merged coverage output to (default 'merged_coverage_file')", default="merged_coverage_file.cov.gz")
parser.add_argument("filenames", help="The Bismark coverage files to merge", nargs='+')

options = parser.parse_args()
return options

def main():

options = parse_options()
print (f"Using the following basename: {options.basename}")
cov_files = options.filenames
print (f"Meging the following files: {cov_files}")
allcov = {} # overarching dictionary

for filename in cov_files:
print (f"Reading methylation calls from file: {filename}")

isGzip = False
if filename.endswith("gz"):
infile = gzip.open(filename) # mode is 'rb'
isGzip = True
else:
infile = open(filename)

for line in infile:

if isGzip:
line = line.decode().rstrip() # need to decode from a Binary to Str object
else:
line = line.rstrip()

# print(line)
# chrom, pos, m, u = line.split(sep="\t")[0,1,4,5]

chrom, pos, m, u = [line.split(sep="\t")[i] for i in (0,1,4,5)] # list comprehension

if chrom in allcov.keys():
pass
# print (f"already present: {chrom}")
else:
allcov[chrom] = {}

# print (f"Key not present yet for chromosome: {chrom}. Creating now")
# converting the position to int() right here
pos = int(pos)

if pos in allcov[chrom].keys():
# print (f"Positions was already present: chr: {chrom}, pos: {pos}")
pass
else:
allcov[chrom][pos] = {}
allcov[chrom][pos]["meth"] = 0
allcov[chrom][pos]["unmeth"] = 0

allcov[chrom][pos]["meth"] += int(m)
allcov[chrom][pos]["unmeth"] += int(u)

# print (f"Chrom: {chrom}")
# print (f"Chrom: {chrom}, Pos: {pos}, Meth: {m}, Unmeth: {u}")
# time.sleep(0.1)

infile.close()





# resetting
del chrom
del pos
outfile = f"{options.basename}.merged.cov.gz"

print (f"Now printing out a new, merged coverage file to >>{outfile}<<")

with gzip.open(outfile,"wt") as out:
# Just sorting by key will sort lexicographically, which is unintended.
# At least not for the positions
for chrom in sorted(allcov.keys()):

# Option I: This is a solution using {} dictionary comprehension. Seems to work.
# print (f"Converting position keys to int first for chromosome {chrom}")
# allcov[chrom] = {int(k) : v for k, v in allcov[chrom].items()}

# Option II: Another option could be to use a Lambda function to make the keys integers on the fly
# print (f"Attempting solution using a Lambda function on the positions for chrom:{chrom}")
# for pos in sorted(allcov[chrom].keys(), key = lambda x: int(x)):
# This also works

# Option III: Since I changed the values going into the positions dictionary, sorted()
# will now automatically perform a numerical sort. So no further action is required.
print (f"Now sorting positions on chromosome: {chrom}")
for pos in sorted(allcov[chrom].keys()):
perc = ''
if (allcov[chrom][pos]['meth'] + allcov[chrom][pos]['unmeth'] == 0):
# This should not happen in real data, as coverage files by definition only show covered positions
print ("Both methylated and unmethylated positions were 0. Dying now...")
sys.exit()
else:
perc = allcov[chrom][pos]['meth'] / (allcov[chrom][pos]['meth'] + allcov[chrom][pos]['unmeth']) * 100

# percentage is displayed with 2 decimals with f-string formatting
out.write(f"{chrom}\t{pos}\t{pos}\t{perc:.2f}\t{allcov[chrom][pos]['meth']}\t{allcov[chrom][pos]['unmeth']}\n")

print ("All done, enjoy the merged coverage file!\n")

if __name__ == '__main__':
main()

0 comments on commit 8c5fe8e

Please sign in to comment.