diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index cfd67a5..fe66aab 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -61,4 +61,21 @@ jobs:
           ls -R tests
       - name: Run workflow
         run: |
-          snakemake --directory tests --configfile tests/config.yaml --profile tests/profile --snakefile workflow/Snakefile
+          ROOT=$(realpath .)
+          snakemake \
+            --directory tests \
+            --configfile tests/config.yaml \
+            --profile tests/profile \
+            --snakefile workflow/Snakefile \
+            --singularity-args="--bind $ROOT --bind $HOME"
+      - name: Test reporting
+        run: |
+          ROOT=$(realpath .)
+          snakemake \
+            --directory tests \
+            --configfile tests/config.yaml \
+            --profile tests/profile \
+            --snakefile workflow/Snakefile \
+            --report report.zip \
+            --singularity-args="--bind $ROOT --bind $HOME"
+
diff --git a/.gitignore b/.gitignore
index 256abfa..9ea447c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 **/.snakemake
 **/results
+**/references
 **/.cache
 **/.conda
 **/.java
@@ -7,4 +8,9 @@
 **/.condarc
 **/workdir
 profile/slurm
-tests4/
\ No newline at end of file
+tests4/
+tests/.wget-hsts
+tests/*.log
+tests/*.out
+run-test.sh
+test.sh
\ No newline at end of file
diff --git a/tests/Dockerfile b/Dockerfile
similarity index 100%
rename from tests/Dockerfile
rename to Dockerfile
diff --git a/tests/profile/config.yaml b/tests/profile/config.yaml
index e0c03fc..6369171 100644
--- a/tests/profile/config.yaml
+++ b/tests/profile/config.yaml
@@ -1,6 +1,5 @@
 use-conda: True
 use-singularity: True
-singularity-args: ""
 show-failed-logs: True
 cores: 2
 conda-cleanup-pkgs: cache
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 4b0df73..40bace9 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -31,6 +31,14 @@ configfile: "config/config.yaml"
 include: "rules/common.smk"
 
+###########################
+## DEFINE PIPELINE CONTAINER
+###########################
+
+
+containerized: "docker://ftabaro/rna-seq:1.0"
+
+
 #######################
 ## DEFINE VARIABLES
 #######################
 
@@ -84,13 +92,7 @@ gtf_path = references_folder.joinpath(
     get_filename(config["genome"]["gtf_url"], decompress=True)
 )
 
-rmsk_path = references_folder.joinpath("rmsk.gtf")
-rmsk_bed = Path(str(rmsk_path).replace("gtf", "bed"))
-
-# gaf_path = references_folder.joinpath(
-#     get_filename(config["genome"]["gaf_url"], decompress=False)
-# )
-
+rmsk_folder = references_folder.joinpath("rmsk")
 tRNA_annotation_dir = references_folder.joinpath("gtrnadb")
 
 ## Get samples
diff --git a/workflow/rules/download-references.smk b/workflow/rules/download-references.smk
index 9686dae..974b94b 100644
--- a/workflow/rules/download-references.smk
+++ b/workflow/rules/download-references.smk
@@ -3,6 +3,7 @@ rule download_genome_fasta_file:
         protected(fasta_path),
     params:
         url=config["genome"]["fasta_url"],
+    cache: True
     conda:
         "../env/wget.yml"
     log:
@@ -18,6 +19,7 @@ rule download_genome_annotation_file:
     output:
         protected(gtf_path),
+    cache: True
     params:
         url=config["genome"]["gtf_url"],
     conda:
@@ -34,8 +36,12 @@ rule download_repeatmasker_annotation_file:
     output:
-        protected(rmsk_path),
-        protected(rmsk_bed),
+        protected(
+            multiext(
+                str(rmsk_folder.joinpath(config["genome"]["label"])), ".gtf", ".bed"
+            )
+        ),
+    cache: True
     params:
         genome_id=config["genome"]["label"],
     conda:
@@ -66,6 +72,7 @@ rule download_gtRNAdb:
             "-tRNAs.fa",
         )
     ),
+    cache: True
     params:
         url=config["genome"]["gtrnadb_url"],
         output_dir=tRNA_annotation_dir,
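A note on the Snakemake changes above: `cache: True` opts the reference-download rules into Snakemake's between-workflow output caching (activated at run time with `--cache` plus a `SNAKEMAKE_OUTPUT_CACHE` directory), and the pairs of `protected()` outputs collapse into a single `protected(multiext(...))` declaration. Below is a minimal stand-in sketch of what `multiext` expands to; the helper and example paths are illustrative, not Snakemake's actual implementation (the real function lives in `snakemake.io` and additionally flags each output):

```python
# Illustrative stand-in for snakemake.io.multiext (assumption: the real
# function also attaches Snakemake-specific flags to each path).
def multiext(prefix: str, *extensions: str) -> list[str]:
    # One output path per extension, all sharing the same prefix.
    return [prefix + ext for ext in extensions]

# With a hypothetical config["genome"]["label"] == "mm10":
print(multiext("references/rmsk/mm10", ".gtf", ".bed"))
# -> ['references/rmsk/mm10.gtf', 'references/rmsk/mm10.bed']
```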
diff --git a/workflow/rules/filter_bam.smk b/workflow/rules/filter_bam.smk
index 819e3f4..bccfa86 100644
--- a/workflow/rules/filter_bam.smk
+++ b/workflow/rules/filter_bam.smk
@@ -1,7 +1,9 @@
 rule filter_bam:
     input:
         alignment=starTE_folder.joinpath("{serie}/{method}/{sample}.Aligned.out.bam"),
-        annotation=rmsk_bed,
+        annotation=rmsk_folder.joinpath(
+            "{0}.{1}".format(config["genome"]["label"], "bed")
+        ),
     output:
         starTE_folder.joinpath("{serie}/filter/{method}/{sample}.TEonly.bam"),
     log:
diff --git a/workflow/rules/star.smk b/workflow/rules/star.smk
index c6ed5c0..ae58037 100644
--- a/workflow/rules/star.smk
+++ b/workflow/rules/star.smk
@@ -23,6 +23,7 @@ rule star_genome_preparation:
         genome_annotation_file=gtf_path,
     output:
         directory(references_folder.joinpath("STAR")),
+    cache: True
     conda:
         "../env/alignment.yml"
     threads: 8
@@ -52,15 +53,18 @@ rule star:
         star_index_folder=references_folder.joinpath("STAR"),
         genome_annotation_file=gtf_path,
     output:
-        star_folder.joinpath("{serie}/{sample}.Aligned.sortedByCoord.out.bam"),
-        star_folder.joinpath("{serie}/{sample}.Aligned.toTranscriptome.out.bam"),
-        star_folder.joinpath("{serie}/{sample}.ReadsPerGene.out.tab"),
-        star_folder.joinpath("{serie}/{sample}.SJ.out.tab"),
-        star_folder.joinpath("{serie}/{sample}.Signal.Unique.str1.out.wig"),
-        star_folder.joinpath("{serie}/{sample}.Signal.Unique.str2.out.wig"),
-        star_folder.joinpath("{serie}/{sample}.Signal.UniqueMultiple.str1.out.wig"),
-        star_folder.joinpath("{serie}/{sample}.Signal.UniqueMultiple.str2.out.wig"),
-        star_folder.joinpath("{serie}/{sample}.Log.final.out"),
+        multiext(
+            str(star_folder.joinpath("{serie}", "{sample}")),
+            ".Aligned.sortedByCoord.out.bam",
+            ".Aligned.toTranscriptome.out.bam",
+            ".ReadsPerGene.out.tab",
+            ".SJ.out.tab",
+            ".Signal.Unique.str1.out.wig",
+            ".Signal.Unique.str2.out.wig",
+            ".Signal.UniqueMultiple.str1.out.wig",
+            ".Signal.UniqueMultiple.str2.out.wig",
+            ".Log.final.out",
+        ),
     threads: 8
     resources:
         runtime=lambda wildcards, attempt: 1440 * attempt,
diff --git a/workflow/rules/starTE.smk b/workflow/rules/starTE.smk
index 4646537..47bbd1c 100644
--- a/workflow/rules/starTE.smk
+++ b/workflow/rules/starTE.smk
@@ -68,7 +68,9 @@ rule featureCounts_random:
                 else samples["paired"][wildcards.serie]
             ),
         ),
-        annotation=rmsk_path,
+        annotation=rmsk_folder.joinpath(
+            "{0}.{1}".format(config["genome"]["label"], "gtf")
+        ),
     output:
         starTE_folder.joinpath("{serie}/featureCount/random.txt"),
     conda:
@@ -218,7 +220,9 @@ rule featureCounts_multihit:
                 else samples["paired"][wildcards.serie]
             ),
        ),
-        annotation=rmsk_path,
+        annotation=rmsk_folder.joinpath(
+            "{0}.{1}".format(config["genome"]["label"], "gtf")
+        ),
     output:
         starTE_folder.joinpath("{serie}/featureCount/multihit.txt"),
     conda:
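The rules above stop referencing the fixed `rmsk_path`/`rmsk_bed` globals and instead derive the RepeatMasker annotation path from the configured genome label. A short sketch of the resolution, with a made-up folder layout and label standing in for the real config values:

```python
from pathlib import Path

# Stand-ins for the workflow's globals; the real values come from config.yaml.
references_folder = Path("references")
rmsk_folder = references_folder / "rmsk"
config = {"genome": {"label": "mm10"}}

# Same construction the rules use for their annotation inputs.
bed = rmsk_folder / "{0}.{1}".format(config["genome"]["label"], "bed")
gtf = rmsk_folder / "{0}.{1}".format(config["genome"]["label"], "gtf")
print(bed)  # references/rmsk/mm10.bed
print(gtf)  # references/rmsk/mm10.gtf
```

This keeps per-genome annotations side by side under `references/rmsk/` instead of overwriting a single `rmsk.gtf`/`rmsk.bed` pair.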
diff --git a/workflow/scripts/download-gtf.sh b/workflow/scripts/download-gtf.sh
index 9e11109..28ff9a6 100644
--- a/workflow/scripts/download-gtf.sh
+++ b/workflow/scripts/download-gtf.sh
@@ -4,31 +4,35 @@ set -e
 
 URL="${snakemake_params[url]}"
 
+mkdir -p $(dirname ${snakemake_log})
+touch ${snakemake_log}
+
 if [[ $URL == *.gz ]]; then
     TMP=$(mktemp -u --suffix .gz)
+    echo "$URL refers to gzipped file. Temp file: $TMP" >> ${snakemake_log}
 else
     TMP=$(mktemp -u)
+    echo "$URL does not refer to gzipped file. Temp file: $TMP" >> ${snakemake_log}
 fi
 
-echo "Downloading to $TMP" | tee -a ${snakemake_log}
+echo "Downloading to $TMP" >> ${snakemake_log}
 
 OUTPUT=${snakemake_output}
 mkdir -pv $(dirname $OUTPUT)
 
-# wget -O $TMP "$URL"
-curl "$URL" \
--H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/118.0' \
--H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' \
--H 'Accept-Language: en-US,en;q=0.5' \
--H 'Accept-Encoding: gzip, deflate' \
--H 'Connection: keep-alive' \
--H 'Upgrade-Insecure-Requests: 1' \
--H 'DNT: 1' \
--H 'Sec-GPC: 1' \
--H 'Pragma: no-cache' \
--H 'Cache-Control: no-cache' \
---silent \
---output $TMP
+wget "$URL" \
+--user-agent='Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/118.0' \
+--header='Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' \
+--header='Accept-Language: en-US,en;q=0.5' \
+--header='Accept-Encoding: gzip, deflate' \
+--header='Connection: keep-alive' \
+--header='Upgrade-Insecure-Requests: 1' \
+--header='DNT: 1' \
+--header='Sec-GPC: 1' \
+--header='Pragma: no-cache' \
+--header='Cache-Control: no-cache' \
+--quiet \
+--output-document="$TMP"
 
 sleep $(( $RANDOM % 10 + 2 ))
 
@@ -37,10 +41,10 @@ if [[ $URL == *.gz ]] && [[ ! $OUTPUT == *.gz ]]; then
     sleep $(( $RANDOM % 10 + 2 ))
 fi
 
-if grep -v '#' "${TMP%.gz}" | head -n 1 | grep -q '^chr' | tee -a ${snakemake_log}; then
-    echo "Mv'ing to $OUTPUT" | tee -a ${snakemake_log}
+if grep -v '#' "${TMP%.gz}" | head -n 1 | grep -q '^chr' >> ${snakemake_log}; then
+    echo "Mv'ing to $OUTPUT" >> ${snakemake_log}
     mv $TMP $OUTPUT
 else
-    echo "Adding \"chr\" to first column, then move to $OUTPUT" | tee -a ${snakemake_log}
+    echo "Adding \"chr\" to first column, then move to $OUTPUT" >> ${snakemake_log}
     awk -F "\t" -v OFS="\t" '!/^#/{print "chr"$0}/#/{print}' ${TMP%.gz} > $OUTPUT
 fi
diff --git a/workflow/scripts/edit_condition_file.py b/workflow/scripts/edit_condition_file.py
index d394968..aa7ec76 100644
--- a/workflow/scripts/edit_condition_file.py
+++ b/workflow/scripts/edit_condition_file.py
@@ -28,9 +28,9 @@ joined = sample_sheet.join(salmon_condition_sheet)
 
 joined["condition"] = joined.apply(
-    lambda row: "control"
-    if row[snakemake.params.variable] == reference_level
-    else "treatment",
+    lambda row: (
+        "control" if row[snakemake.params.variable] == reference_level else "treatment"
+    ),
     axis=1,
 )
 joined = joined.reset_index()
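The `edit_condition_file.py` hunk is purely cosmetic (Black-style parenthesized ternary), but the behavior is easy to check in isolation. A tiny self-contained demo; the column name and reference level are made up, the real ones come from `snakemake.params`:

```python
import pandas as pd

joined = pd.DataFrame({"genotype": ["wt", "ko", "wt"]})
reference_level = "wt"   # hypothetical stand-in for the params reference level
variable = "genotype"    # hypothetical stand-in for snakemake.params.variable

# Rows matching the reference level become "control", everything else "treatment".
joined["condition"] = joined.apply(
    lambda row: (
        "control" if row[variable] == reference_level else "treatment"
    ),
    axis=1,
)
print(joined["condition"].tolist())  # ['control', 'treatment', 'control']
```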
diff --git a/workflow/scripts/get_rmsk.py b/workflow/scripts/get_rmsk.py
index 04b1c2e..47f426b 100644
--- a/workflow/scripts/get_rmsk.py
+++ b/workflow/scripts/get_rmsk.py
@@ -1,20 +1,81 @@
 from urllib import request, parse
+from urllib.error import HTTPError, URLError
 import json
 import csv
 import sys
+import logging
+from time import sleep
+from random import randint
 
 
-def main(
-    genome="mm10", track_name="rmsk", gtf_output=sys.stdout, bed_output=sys.stdout
-):
+def setup_logger(log_file):
+
+    # Create a logger
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.DEBUG)
+
+    # Create a file handler and set the level to debug
+    file_handler = logging.FileHandler(log_file)
+    file_handler.setLevel(logging.DEBUG)
+
+    # Create a console handler and set the level to info
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.INFO)
+
+    # Create a formatter and add it to the handlers
+    formatter = logging.Formatter(
+        "%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
+    )
+    file_handler.setFormatter(formatter)
+    console_handler.setFormatter(formatter)
+
+    # Add the handlers to the logger
+    logger.addHandler(file_handler)
+    logger.addHandler(console_handler)
+
+    return logger
+
+
+def build_request(base_url, params, logger=setup_logger("download-rmsk.log")):
+    query_string = parse.urlencode(params).encode("ascii")
+
+    url = request.Request(
+        url=base_url,
+        data=query_string,
+        method="GET",
+    )
+    logger.debug("Created url: {0}".format(url.get_full_url()))
+    return url
+
+
+def fetch_chromosomes(
+    genome="mm10", track="rmsk", logger=setup_logger("download-rmsk.log")
+):
     """
-    A function that queries UCSC genome browser to download RepeatMasker
-    annotation.
+    A function to fetch the list of chromosomes of a given organism from UCSC.
     """
 
-    # setup query string
-    query_object = {"genome": genome, "track": track_name}
-    query_string = parse.urlencode(query_object).encode("ascii")
+    base_url = "https://api.genome.ucsc.edu/list/chromosomes"
+
+    params = {"genome": genome, "track": track}
+    url = build_request(base_url, params)
+
+    try:
+        with request.urlopen(url) as response:
+            data = json.loads(response.read().decode("utf-8"))
+            return data
+    except HTTPError as e:
+        logger.critical(f"HTTPError: {e.code} - {e.reason}")
+        raise e
+    except URLError as e:
+        logger.critical(f"URLError: {e.reason}")
+        raise e
+
+
+def get_gtf_writer(gtf_output, logger):
+    """
+    A function to build a GTF-compliant CSV writer.
+    """
 
     # output field names
     gtf_fieldnames = [
@@ -29,83 +90,190 @@ def main(
         "attribute",
     ]
 
-    bed_fieldnames = ["genoName", "genoStart", "genoEnd", "swScore", "strand", "name"]
-
-    # create request object
-    url = request.Request(
-        url="https://api.genome.ucsc.edu/getData/track",
-        data=query_string,
-        method="GET",
-    )
-
     # create DictWriter for later
     gtf_writer = csv.DictWriter(
         f=gtf_output,
         fieldnames=gtf_fieldnames,
         dialect="excel-tab",
     )
+    logger.debug("Created GTF writer. Fields = {0}".format(gtf_fieldnames))
+
+    return gtf_writer
+
+
+def get_gtf_row(data):
+    """
+    A function that maps UCSC RepeatMasker repeat JSON to GTF fields
+    """
+    # GTF is 1-based, therefore, genoStart needs +1
+    return {
+        "genoName": data["genoName"],
+        "source": "RepeatMasker",
+        "feature": "exon",
+        "genoStart": int(data["genoStart"]) + 1,
+        "genoEnd": int(data["genoEnd"]),
+        "swScore": data["swScore"],
+        "strand": data["strand"],
+        "frame": ".",
+        "attribute": ";".join(
+            [
+                f"{key}={data[key]}"
+                for key in filter(
+                    lambda x: x
+                    in [
+                        "repName",
+                        "repClass",
+                        "repFamily",
+                        "repStart",
+                        "repEnd",
+                        "repLeft",
+                    ],
+                    data,
+                )
+            ]
+        ),
+    }
+
+
+def write_gtf_row(writer, data):
+    """
+    A function that writes a single GTF row using the given GTF Writer and data
+    """
+    row = get_gtf_row(data)
+    writer.writerow(row)
+
 
+def get_bed_writer(bed_output, logger):
+    """
+    A function to build a BED-compliant CSV writer.
+    """
+    bed_fieldnames = ["genoName", "genoStart", "genoEnd", "swScore", "strand", "name"]
     bed_writer = csv.DictWriter(
         f=bed_output, fieldnames=bed_fieldnames, dialect="excel-tab"
     )
+    logger.debug("Created BED writer. Fields = {0}".format(bed_fieldnames))
Fields = {0}".format(bed_fieldnames)) + return bed_writer + + +def get_bed_row(data): + """ + A function that maps UCSC RepeatMasker repeat JSON to BED fields + """ + # BED is zero-based, therefore, genoStart does not need +1 + return { + "genoName": data["genoName"], + "genoStart": data["genoStart"], + "genoEnd": data["genoEnd"], + "swScore": data["swScore"], + "strand": data["strand"], + "name": data["repName"], + } + + +def write_bed_row(writer, data): + """ + A function that writes a single BED row using the given BED Writer and data + """ + row = get_bed_row(data) + writer.writerow(row) + + +def main( + genome="mm10", + track_name="rmsk", + gtf_output=sys.stdout, + bed_output=sys.stdout, + logger=setup_logger("download-rmsk.log"), +): + """ + A function that queries UCSC genome browser to download RepeatMasker + annotation. + """ + + gtf_writer = get_gtf_writer(gtf_output, logger) + bed_writer = get_bed_writer(bed_output, logger) + + chromosome_dict = fetch_chromosomes(genome, track_name, logger) - # perform the query and parse response - with request.urlopen(url) as response: - dat = json.loads(response.read()) - track_data = dat["rmsk"] - - for chromosome in track_data.values(): - for repeat in chromosome: - # UCSC works zero-based base position - # https://www.biostars.org/p/84686/ - - # GTF is 1-based, therefore, genoStart needs +1 - gtf_row = { - "genoName": repeat["genoName"], - "source": "RepeatMasker", - "feature": "exon", - "genoStart": int(repeat["genoStart"]) + 1, - "genoEnd": int(repeat["genoEnd"]), - "swScore": repeat["swScore"], - "strand": repeat["strand"], - "frame": ".", - "attribute": ";".join( - [ - f"{key}={repeat[key]}" - for key in filter( - lambda x: x - in [ - "repName", - "repClass", - "repFamily", - "repStart", - "repEnd", - "repLeft", - ], - repeat, - ) - ] - ), - } - - # BED is zero-based, therefore, genoStart does not need +1 - bed_row = { - "genoName": repeat["genoName"], - "genoStart": repeat["genoStart"], - "genoEnd": repeat["genoEnd"], - "swScore": repeat["swScore"], - "strand": repeat["strand"], - "name": repeat["repName"], - } - - gtf_writer.writerow(gtf_row) - bed_writer.writerow(bed_row) + base_url = "https://api.genome.ucsc.edu/getData/track" + + counter = 0 + nchromosomes = 0 + + for chrom in chromosome_dict["chromosomes"]: + nchromosomes += 1 + + # setup query string + params = {"genome": genome, "track": track_name, "chrom": chrom} + logger.debug("Query string = {0}".format(params)) + + # create request object + url = build_request(base_url, params, logger) + + logger.debug("Starting GTF/BED export for chromosome {0}".format(chrom)) + + # perform the query and parse response + chrom_lines = 0 + try: + with request.urlopen(url) as response: + dat = json.loads(response.read()) + + logger.info( + "Downloaded {0} - {1} repeats - track name {2} - url {3}".format( + chrom, len(dat["rmsk"]), track_name, url.get_full_url() + ) + ) + + for repeat in dat["rmsk"]: + + # UCSC works zero-based base position + # https://www.biostars.org/p/84686/ + + write_gtf_row(gtf_writer, repeat) + write_bed_row(bed_writer, repeat) + + chrom_lines += 1 + counter += 1 + + logger.debug( + "Completed {0} - written {1} lines.".format(chrom, chrom_lines) + ) + + sleep(randint(1, 3)) + + except HTTPError as e: + logger.critical(f"HTTPError: {e.code} - {e.reason}") + raise e + except URLError as e: + logger.critical(f"URLError: {e.reason}") + raise e + + logger.info( + "Successfully written {0} lines over {1} chromosomes.".format( + counter, nchromosomes + ) + ) if __name__ == 
"__main__": + logger = setup_logger(str(snakemake.log)) + genome = snakemake.params["genome_id"] + logger.info("Genome = {0}".format(genome)) + track_name = "rmsk" + logger.info("Track = {0}".format(track_name)) + gtf_output = open(str(snakemake.output[0]), "w") + logger.info("Output path = {0}".format(gtf_output)) + bed_output = open(str(snakemake.output[1]), "w") + logger.info("Bed output = {0}".format(bed_output)) - main(genome, track_name, gtf_output, bed_output) + main( + genome=genome, + track_name=track_name, + gtf_output=gtf_output, + bed_output=bed_output, + logger=logger, + )