Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Annotation parallelization #84

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 89 additions & 112 deletions bin/annotate_outputs.sh
Original file line number Diff line number Diff line change
@@ -1,143 +1,120 @@
#!/usr/bin/env bash

echo "===================================================================================="
echo "[nf-core/circrna]: circRNA annotation script "
echo "[nf-core/circrna]: Author: Barry Digby "
echo "[nf-core/circrna]: Institution: National University of Ireland, Galway "
echo "[nf-core/circrna]: "
echo "[nf-core/circrna]: MIT license "
echo "===================================================================================="

mkdir -p bed12

EB=$1

while IFS='' read -r line; do

name=$(echo $line | awk '{print $4}')
count=$(echo $line | awk '{print $5}')
touch ${name}.bed
echo "$line" >> ${name}.bed_tmp
sed 's/[\t]*$//' ${name}.bed_tmp > ${name}.bed && rm ${name}.bed_tmp
chr=$(echo $2 | awk '{print $1}')
start=$(echo $2 | awk '{print $2}')
stop=$(echo $2 | awk '{print $3}')
strand=$(echo $2 | awk '{print $4}')

bedtools intersect -a filt.gtf -b ${name}.bed -s -f 1.00 > ${name}.gtf
name="$chr:$start-$stop:$strand"
count=0
touch ${name}.bed
echo -e "$chr\t$start\t$stop\t$name\t$count\t$strand" >> ${name}.bed_tmp
sed 's/[\t]*$//' ${name}.bed_tmp > ${name}.bed && rm ${name}.bed_tmp

start=$(echo $line | awk '{print $2}')
stop=$(echo $line | awk '{print $3}')
bedtools intersect -a filt.gtf -b ${name}.bed -f 1.00 > ${name}.gtf

echo "[nf-core/circrna]: Starting analysis for: $name"

# is the gtf file NOT (-s) empty? i.e did it overlap biotypes?
if [[ -s ${name}.gtf ]];
then
echo "[nf-core/circrna]: Starting analysis for: $name"

echo "[nf-core/circrna]: $name overlaps features in GTF file"
echo "[nf-core/circrna]: Inspecting Genes..."
# is the gtf file NOT (-s) empty? i.e did it overlap biotypes?
if [[ -s ${name}.gtf ]];
then

gene_id=$(awk -F'gene_id ' '{print $2}' ${name}.gtf | \
awk -F';' '{print $1}' | sed 's/"//g' | sort -u | paste -s -d, -)
echo "[nf-core/circrna]: $name overlaps features in GTF file"
echo "[nf-core/circrna]: Inspecting Genes..."

tx_id=$(awk -F'transcript_id ' '{print $2}' ${name}.gtf | \
awk -F';' '{print $1}' | sed 's/"//g' | sort -u | paste -s -d, -)
gene_id=$(awk -F'gene_id ' '{print $2}' ${name}.gtf | \
awk -F';' '{print $1}' | sed 's/"//g' | sort -u | paste -s -d, -)

echo "[nf-core/circrna]: Overlapping Gene IDs: $gene_id"
echo "[nf-core/circrna]: Converting to BED12"
tx_id=$(awk -F'transcript_id ' '{print $2}' ${name}.gtf | \
awk -F';' '{print $1}' | sed 's/"//g' | sort -u | paste -s -d, -)

gtfToGenePred ${name}.gtf ${name}.genepred
genePredToBed ${name}.genepred ${name}_predtobed.bed

# Attempting perfect exon boundary overlaps
echo "[nf-core/circrna]: Attempting to fit circRNA to gene exon boundaries"
awk -v OFS="\t" -v start="$start" -v stop="$stop" \
'{if($2==start && $3==stop) print $0}' ${name}_predtobed.bed | \
sort -rnk10 | head -n 1 > ${name}.bed12.bed

# Resulting file not empty? i.e perfectly overlapped with exon boundaries?
if [[ -s ${name}.bed12.bed ]];
then
echo "[nf-core/circrna]: ${name} fits gene exons, is a circRNA"
type="circRNA"
else
echo "[nf-core/circrna]: Overlapping Gene IDs: $gene_id"
echo "[nf-core/circrna]: Converting to BED12"

echo "[nf-core/circrna]: circRNA overlaps exons, but not boundaries"
echo "[nf-core/circrna]: Investigating if EIciRNA or acceptable to take underlying transcript"
echo "[nf-core/circrna]: Retrying with longest underlying transcript"
gtfToGenePred ${name}.gtf ${name}.genepred
genePredToBed ${name}.genepred ${name}_predtobed.bed

awk -v OFS="\t" '{$13 = $3 - $2; print}' ${name}_predtobed.bed | \
sort -rnk13 | cut -f13 --complement | head -n 1 > ${name}.bed12.bed_tmp
# Attempting perfect exon boundary overlaps
echo "[nf-core/circrna]: Attempting to fit circRNA to gene exon boundaries"
awk -v OFS="\t" -v start="$start" -v stop="$stop" \
'{if($2==start && $3==stop) print $0}' ${name}_predtobed.bed | \
sort -rnk10 | head -n 1 > ${name}.bed12.bed

tx_len=$(awk -v OFS="\t" '{$13 = $3 - $2; print}' ${name}_predtobed.bed | \
sort -rnk13 | awk '{print $13}' | head -n 1)
# Resulting file not empty? i.e perfectly overlapped with exon boundaries?
if [[ -s ${name}.bed12.bed ]];
then
echo "[nf-core/circrna]: ${name} fits gene exons, is a circRNA"
type="circRNA"
else

circ_len=$(awk -v OFS="\t" '{$7 = $3 - $2; print}' ${name}.bed | awk '{print $7}')
echo "[nf-core/circrna]: circRNA overlaps exons, but not boundaries"
echo "[nf-core/circrna]: Investigating if EIciRNA or acceptable to take underlying transcript"
echo "[nf-core/circrna]: Retrying with longest underlying transcript"

echo "[nf-core/circrna]: Best transcript length: $tx_len"
echo "[nf-core/circrna]: $name length: $circ_len"
awk -v OFS="\t" '{$13 = $3 - $2; print}' ${name}_predtobed.bed | \
sort -rnk13 | cut -f13 --complement | head -n 1 > ${name}.bed12.bed_tmp

difference=$(($circ_len - $tx_len))
tx_len=$(awk -v OFS="\t" '{$13 = $3 - $2; print}' ${name}_predtobed.bed | \
sort -rnk13 | awk '{print $13}' | head -n 1)

if [[ $difference -gt $EB ]];
then
circ_len=$(awk -v OFS="\t" '{$7 = $3 - $2; print}' ${name}.bed | awk '{print $7}')

echo "[nf-core/circrna]: Transcript exon boundaries more than ${EB}bp off $name"
echo "[nf-core/circrna]: Treating as EIciRNA"
echo "[nf-core/circrna]: Best transcript length: $tx_len"
echo "[nf-core/circrna]: $name length: $circ_len"

type="EIciRNA"
block_count=1
block_size=$(($stop-$start))
rgb="0,0,0"
block_start=0
awk -v OFS="\t" -v thick=$start -v rgb=$rgb -v count=$block_count -v start=$block_start -v size=$block_size \
'{print $0, thick, thick, rgb, count, size, start}' ${name}.bed > ${name}.bed12.bed
rm ${name}.bed12.bed_tmp
else
difference=$(($circ_len - $tx_len))

echo "[nf-core/circrna]: Transcript exon boundaries within ${EB}bp ${name}"
echo "[nf-core/circrna]: Treating ${name} as circRNA."
type="circRNA"
mv ${name}.bed12.bed_tmp ${name}.bed12.bed
fi
fi
else
if [[ $difference -gt $EB ]];
then

echo "[nf-core/circrna]: $name returned no GTF overlaps."
echo "[nf-core/circrna]: Treating as an intronic circRNA"
echo "[nf-core/circrna]: Transcript exon boundaries more than ${EB}bp off $name"
echo "[nf-core/circrna]: Treating as EIciRNA"

gene_id="NA"
tx_id="NA"
type="ciRNA"
type="EIciRNA"
block_count=1
block_size=$(($stop-$start))
rgb="0,0,0"
block_start=0
awk -v OFS="\t" -v thick=$start -v rgb=$rgb -v count=$block_count -v start=$block_start -v size=$block_size \
'{print $0, thick, thick, rgb, count, size, start}' ${name}.bed > ${name}.bed12.bed
rm ${name}.bed12.bed_tmp
else

echo "[nf-core/circrna]: Transcript exon boundaries within ${EB}bp ${name}"
echo "[nf-core/circrna]: Treating ${name} as circRNA."
type="circRNA"
mv ${name}.bed12.bed_tmp ${name}.bed12.bed
fi
fi
# add type, geneid tx_id and count
awk -v type="$type" -v gene="$gene_id" -v tx="$tx_id" -v count="$count" 'BEGIN{FS=OFS="\t"}{$5=count;$13=type;$14=gene;$15=tx}1' ${name}.bed12.bed > ${name}.bed12.bed_tmp
awk -v OFS="\t" -v name=$name '{$4 = name; print}' ${name}.bed12.bed_tmp > ${name}.bed12.bed_tmp1

rm ${name}.bed12.bed
rm ${name}.bed12.bed_tmp
mv ${name}.bed12.bed_tmp1 ${name}.bed12.bed
echo "[nf-core/circrna]: cleaning up intermediate files"
rm -f ${name}.gtf
rm -f ${name}.genepred
rm -f ${name}_predtobed.bed
rm -f ${name}.bed

cp ${name}.bed12.bed bed12/
rm -rf ${name}.bed12.bed
echo "===================================================================================="
echo "===================================================================================="

done < circs.bed

# remove the trailing commas that appear on end of Exon Size, Exon Starts (col 11, 12)
cat bed12/*.bed12.bed > master_bed12.bed.tmp

awk 'BEGIN{FS=OFS="\t"} {gsub(/,$/,"",$11);gsub(/,$/,"",$12)} 1' master_bed12.bed.tmp > master_bed12.bed && rm master_bed12.bed.tmp

echo " Thank you for using nf-core/circrna! - Barry "
echo "===================================================================================="
echo "===================================================================================="
else

echo "[nf-core/circrna]: $name returned no GTF overlaps."
echo "[nf-core/circrna]: Treating as an intronic circRNA"

gene_id="NA"
tx_id="NA"
type="ciRNA"
block_count=1
block_size=$(($stop-$start))
rgb="0,0,0"
block_start=0
awk -v OFS="\t" -v thick=$start -v rgb=$rgb -v count=$block_count -v start=$block_start -v size=$block_size \
'{print $0, thick, thick, rgb, count, size, start}' ${name}.bed > ${name}.bed12.bed
fi
# add type, geneid tx_id and count
awk -v type="$type" -v gene="$gene_id" -v tx="$tx_id" -v count="$count" 'BEGIN{FS=OFS="\t"}{$5=count;$13=type;$14=gene;$15=tx}1' ${name}.bed12.bed > ${name}.bed12.bed_tmp
awk -v OFS="\t" -v name=$name '{$4 = name; print}' ${name}.bed12.bed_tmp > ${name}.bed12.bed_tmp1

rm ${name}.bed12.bed
rm ${name}.bed12.bed_tmp
mv ${name}.bed12.bed_tmp1 ${name}.bed12.bed
echo "[nf-core/circrna]: cleaning up intermediate files"
rm -f ${name}.gtf
rm -f ${name}.genepred
rm -f ${name}_predtobed.bed
rm -f ${name}.bed

cp ${name}.bed12.bed bed12/
rm -rf ${name}.bed12.bed
27 changes: 24 additions & 3 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -596,9 +596,30 @@ if (!params.skip_trimming) {
]
}

withName: REMOVE_HEADER {
ext.args = "'{if (NR>1) {print}}'"
ext.suffix = "noheader.tsv"
}

withName: KEEP_FOUR_COLUMNS {
// Use tab as field separator
ext.args = "'BEGIN {FS=\"\t\"} {print \$1\"\t\"\$2\"\t\"\$3\"\t\"\$4}'"
ext.suffix = "four_columns.tsv"
}

withName: INTERSECT_ANNOTATION {
ext.args = "-wo"
ext.suffix = "intersect.bed"
}

withName: INV_INTERSECT_ANNOTATION {
ext.args = "-v"
ext.suffix = "inv_intersect.bed"
}

withName: ANNOTATION {
publishDir = [
path: { "${params.outdir}/circrna_discovery/${meta.tool}/${meta.id}" },
path: { "${params.outdir}/circrna_discovery/${meta.id}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
Expand All @@ -607,7 +628,7 @@ if (!params.skip_trimming) {
withName: FASTA {
ext.when = { params.module.split(',').contains('circrna_discovery') }
publishDir = [
path: { "${params.outdir}/circrna_discovery/${meta.tool}/${meta.id}" },
path: { "${params.outdir}/circrna_discovery/${meta.id}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
pattern: "*.fasta"
Expand Down Expand Up @@ -670,7 +691,7 @@ if (!params.skip_trimming) {
withName: MIRNA_TARGETS {
ext.when = { params.module.split(',').contains('mirna_prediction') }
publishDir = [
path: { "${params.outdir}/mirna_prediction/${meta.tool}" },
path: { "${params.outdir}/mirna_prediction" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
pattern: "*.txt"
Expand Down
3 changes: 3 additions & 0 deletions docker/annotation/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
FROM continuumio/miniconda3:23.9.0-0

RUN conda install -y -c bioconda -c conda-forge ucsc-gtftogenepred=377 ucsc-genepredtobed=377 bedtools=2.27.0 parallel=20230922
5 changes: 5 additions & 0 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
services:
annotation:
build:
context: annotation
image: bigdatainbiomedicine/circ-annotation
80 changes: 80 additions & 0 deletions modules/local/annotation/annotation/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
process ANNOTATION {
tag "$meta.id"
label 'process_single'

conda "bioconda::pandas=1.5.2"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/pandas:1.5.2' :
'quay.io/biocontainers/pandas:1.5.2' }"

input:
tuple val(meta), path(intersection)
tuple val(meta2), path(inv_intersection)

output:
tuple val(meta), path("annotation.bed"), emit: bed
tuple val(meta2), path("annotation.tsv"), emit: locstring

script:
"""
#!/usr/bin/env python

import pandas as pd
import os

columns = {
0: 'chr',
1: 'start',
2: 'end',
3: 'strand',
12: 'attributes',
13: 'overlap'
}

attributes = ['gene_id', 'transcript_id']

df = pd.read_csv('$intersection', sep='\\t', header=None, usecols=columns.keys())
df = df.rename(columns=columns)

# Convert attributes to a dictionary
df['attributes'] = df['attributes'].apply(lambda row: dict([[value.strip(r'\"') for value in entry.strip().split(' ')] for entry in row.split(';') if entry]))
# Keep only the attributes we want
df['attributes'] = df['attributes'].apply(lambda row: {key: row[key] for key in attributes})
# Convert attributes to columns
df = pd.concat([df.drop(['attributes'], axis=1), df['attributes'].apply(pd.Series)], axis=1)

df = df.groupby(['chr', 'start', 'end', 'strand']).aggregate(lambda x: set(x))

df['length'] = df.index.map(lambda row: row[2] - row[1])
# Check if there is an overlap entry with the same length as the feature
df['perfect'] = df.apply(lambda row: row['length'] in row['overlap'], axis=1)
# Drop the overlap and length columns
df = df.drop(['overlap', 'length'], axis=1)
# Kee only unique gene_ids
df['gene_id'] = df['gene_id'].apply(lambda row: ','.join(row))
df['transcript_id'] = df['transcript_id'].apply(lambda row: ','.join(row))

# If 'perfect': circRNA, else EIciRNA
df['type'] = df['perfect'].apply(lambda perfect: 'circRNA' if perfect else 'EIciRNA')
# Drop the perfect column
df = df.drop(['perfect'], axis=1)

if not os.stat('$inv_intersection').st_size == 0:
df_nooverlap = pd.read_csv('$inv_intersection', sep='\\t', header=None)
df_nooverlap.columns = ['chr', 'start', 'end', 'strand']
df_nooverlap['type'] = 'ciRNA'
df_nooverlap['gene_id'] = 'NA'

# Set chr, start, end and strand as index
df_nooverlap = df_nooverlap.set_index(['chr', 'start', 'end', 'strand'])

df = pd.concat([df, df_nooverlap])

df = df.sort_index()

df.to_csv('annotation.bed', sep='\\t', index=True, header=False)

df.index = df.index.map(lambda row: "{}:{}-{}:{}".format(*row))
df.to_csv('annotation.tsv', sep='\\t', index=True, header=True)
"""
}
Loading
Loading