nf-core · nictru · Nov 19, 2023 · Nov 20, 2023 · Nov 21, 2023 · Nov 21, 2023
diff --git a/bin/annotate_outputs.sh b/bin/annotate_outputs.sh
@@ -1,143 +1,120 @@
 #!/usr/bin/env bash
 
-echo "===================================================================================="
-echo "[nf-core/circrna]: circRNA annotation script                                        "
-echo "[nf-core/circrna]: Author: Barry Digby                                              "
-echo "[nf-core/circrna]: Institution: National University of Ireland, Galway              "
-echo "[nf-core/circrna]:                                                                  "
-echo "[nf-core/circrna]: MIT license                                                      "
-echo "===================================================================================="
-
-mkdir -p bed12
-
 EB=$1
 
-while IFS='' read -r line; do
-
-    name=$(echo $line | awk '{print $4}')
-    count=$(echo $line | awk '{print $5}')
-    touch ${name}.bed
-    echo "$line" >> ${name}.bed_tmp
-    sed 's/[\t]*$//' ${name}.bed_tmp > ${name}.bed && rm ${name}.bed_tmp
+chr=$(echo $2 | awk '{print $1}')
+start=$(echo $2 | awk '{print $2}')
+stop=$(echo $2 | awk '{print $3}')
+strand=$(echo $2 | awk '{print $4}')
 
-    bedtools intersect -a filt.gtf -b ${name}.bed -s -f 1.00 > ${name}.gtf
+name="$chr:$start-$stop:$strand"
+count=0
+touch ${name}.bed
+echo -e "$chr\t$start\t$stop\t$name\t$count\t$strand" >> ${name}.bed_tmp
+sed 's/[\t]*$//' ${name}.bed_tmp > ${name}.bed && rm ${name}.bed_tmp
 
-    start=$(echo $line | awk '{print $2}')
-    stop=$(echo $line | awk '{print $3}')
+bedtools intersect -a filt.gtf -b ${name}.bed -f 1.00 > ${name}.gtf
 
-    echo "[nf-core/circrna]: Starting analysis for: $name"
-
-    # is the gtf file NOT (-s) empty? i.e did it overlap biotypes?
-    if [[ -s ${name}.gtf ]];
-    then
+echo "[nf-core/circrna]: Starting analysis for: $name"
 
-        echo "[nf-core/circrna]: $name overlaps features in GTF file"
-        echo "[nf-core/circrna]: Inspecting Genes..."
+# is the gtf file NOT (-s) empty? i.e did it overlap biotypes?
+if [[ -s ${name}.gtf ]];
+then
 
-        gene_id=$(awk -F'gene_id ' '{print $2}' ${name}.gtf | \
-        awk -F';' '{print $1}' | sed 's/"//g' | sort -u | paste -s -d, -)
+    echo "[nf-core/circrna]: $name overlaps features in GTF file"
+    echo "[nf-core/circrna]: Inspecting Genes..."
 
-        tx_id=$(awk -F'transcript_id ' '{print $2}' ${name}.gtf | \
-        awk -F';' '{print $1}' | sed 's/"//g' | sort -u | paste -s -d, -)
+    gene_id=$(awk -F'gene_id ' '{print $2}' ${name}.gtf | \
+    awk -F';' '{print $1}' | sed 's/"//g' | sort -u | paste -s -d, -)
 
-        echo "[nf-core/circrna]: Overlapping Gene IDs: $gene_id"
-        echo "[nf-core/circrna]: Converting to BED12"
+    tx_id=$(awk -F'transcript_id ' '{print $2}' ${name}.gtf | \
+    awk -F';' '{print $1}' | sed 's/"//g' | sort -u | paste -s -d, -)
 
-        gtfToGenePred ${name}.gtf ${name}.genepred
-        genePredToBed ${name}.genepred ${name}_predtobed.bed
-
-        # Attempting perfect exon boundary overlaps
-        echo "[nf-core/circrna]: Attempting to fit circRNA to gene exon boundaries"
-        awk -v OFS="\t" -v start="$start" -v stop="$stop" \
-        '{if($2==start && $3==stop) print $0}' ${name}_predtobed.bed | \
-        sort -rnk10 | head -n 1 > ${name}.bed12.bed
-
-        # Resulting file not empty? i.e perfectly overlapped with exon boundaries?
-        if [[ -s ${name}.bed12.bed ]];
-        then
-            echo "[nf-core/circrna]: ${name} fits gene exons, is a circRNA"
-            type="circRNA"
-        else
+    echo "[nf-core/circrna]: Overlapping Gene IDs: $gene_id"
+    echo "[nf-core/circrna]: Converting to BED12"
 
-            echo "[nf-core/circrna]: circRNA overlaps exons, but not boundaries"
-            echo "[nf-core/circrna]: Investigating if EIciRNA or acceptable to take underlying transcript"
-            echo "[nf-core/circrna]: Retrying with longest underlying transcript"
+    gtfToGenePred ${name}.gtf ${name}.genepred
+    genePredToBed ${name}.genepred ${name}_predtobed.bed
 
-            awk -v OFS="\t" '{$13 = $3 - $2; print}' ${name}_predtobed.bed | \
-            sort -rnk13 | cut -f13 --complement | head -n 1 > ${name}.bed12.bed_tmp
+    # Attempting perfect exon boundary overlaps
+    echo "[nf-core/circrna]: Attempting to fit circRNA to gene exon boundaries"
+    awk -v OFS="\t" -v start="$start" -v stop="$stop" \
+    '{if($2==start && $3==stop) print $0}' ${name}_predtobed.bed | \
+    sort -rnk10 | head -n 1 > ${name}.bed12.bed
 
-            tx_len=$(awk -v OFS="\t" '{$13 = $3 - $2; print}' ${name}_predtobed.bed | \
-            sort -rnk13 | awk '{print $13}' | head -n 1)
+    # Resulting file not empty? i.e perfectly overlapped with exon boundaries?
+    if [[ -s ${name}.bed12.bed ]];
+    then
+        echo "[nf-core/circrna]: ${name} fits gene exons, is a circRNA"
+        type="circRNA"
+    else
 
-            circ_len=$(awk -v OFS="\t" '{$7 = $3 - $2; print}' ${name}.bed | awk '{print $7}')
+        echo "[nf-core/circrna]: circRNA overlaps exons, but not boundaries"
+        echo "[nf-core/circrna]: Investigating if EIciRNA or acceptable to take underlying transcript"
+        echo "[nf-core/circrna]: Retrying with longest underlying transcript"
 
-            echo "[nf-core/circrna]: Best transcript length: $tx_len"
-            echo "[nf-core/circrna]: $name length: $circ_len"
+        awk -v OFS="\t" '{$13 = $3 - $2; print}' ${name}_predtobed.bed | \
+        sort -rnk13 | cut -f13 --complement | head -n 1 > ${name}.bed12.bed_tmp
 
-            difference=$(($circ_len - $tx_len))
+        tx_len=$(awk -v OFS="\t" '{$13 = $3 - $2; print}' ${name}_predtobed.bed | \
+        sort -rnk13 | awk '{print $13}' | head -n 1)
 
-            if [[ $difference -gt $EB ]];
-            then
+        circ_len=$(awk -v OFS="\t" '{$7 = $3 - $2; print}' ${name}.bed | awk '{print $7}')
 
-                echo "[nf-core/circrna]: Transcript exon boundaries more than ${EB}bp off $name"
-                echo "[nf-core/circrna]: Treating as EIciRNA"
+        echo "[nf-core/circrna]: Best transcript length: $tx_len"
+        echo "[nf-core/circrna]: $name length: $circ_len"
 
-                type="EIciRNA"
-                block_count=1
-                block_size=$(($stop-$start))
-                rgb="0,0,0"
-                block_start=0
-                awk -v OFS="\t" -v thick=$start -v rgb=$rgb -v count=$block_count -v start=$block_start -v size=$block_size \
-                '{print $0, thick, thick, rgb, count, size, start}' ${name}.bed > ${name}.bed12.bed
-                rm ${name}.bed12.bed_tmp
-            else
+        difference=$(($circ_len - $tx_len))
 
-                echo "[nf-core/circrna]: Transcript exon boundaries within ${EB}bp ${name}"
-                echo "[nf-core/circrna]: Treating ${name} as circRNA."
-                type="circRNA"
-                mv ${name}.bed12.bed_tmp ${name}.bed12.bed
-            fi
-        fi
-        else
+        if [[ $difference -gt $EB ]];
+        then
 
-            echo "[nf-core/circrna]: $name returned no GTF overlaps."
-            echo "[nf-core/circrna]: Treating as an intronic circRNA"
+            echo "[nf-core/circrna]: Transcript exon boundaries more than ${EB}bp off $name"
+            echo "[nf-core/circrna]: Treating as EIciRNA"
 
-            gene_id="NA"
-            tx_id="NA"
-            type="ciRNA"
+            type="EIciRNA"
             block_count=1
             block_size=$(($stop-$start))
             rgb="0,0,0"
             block_start=0
             awk -v OFS="\t" -v thick=$start -v rgb=$rgb -v count=$block_count -v start=$block_start -v size=$block_size \
             '{print $0, thick, thick, rgb, count, size, start}' ${name}.bed > ${name}.bed12.bed
+            rm ${name}.bed12.bed_tmp
+        else
+
+            echo "[nf-core/circrna]: Transcript exon boundaries within ${EB}bp ${name}"
+            echo "[nf-core/circrna]: Treating ${name} as circRNA."
+            type="circRNA"
+            mv ${name}.bed12.bed_tmp ${name}.bed12.bed
+        fi
     fi
-    # add type, geneid tx_id and count
-    awk -v type="$type" -v gene="$gene_id" -v tx="$tx_id" -v count="$count" 'BEGIN{FS=OFS="\t"}{$5=count;$13=type;$14=gene;$15=tx}1' ${name}.bed12.bed > ${name}.bed12.bed_tmp
-    awk -v OFS="\t" -v name=$name '{$4 = name; print}' ${name}.bed12.bed_tmp > ${name}.bed12.bed_tmp1
-
-    rm ${name}.bed12.bed
-    rm ${name}.bed12.bed_tmp
-    mv ${name}.bed12.bed_tmp1 ${name}.bed12.bed
-    echo "[nf-core/circrna]: cleaning up intermediate files"
-    rm -f ${name}.gtf
-    rm -f ${name}.genepred
-    rm -f ${name}_predtobed.bed
-    rm -f ${name}.bed
-
-    cp ${name}.bed12.bed bed12/
-    rm -rf ${name}.bed12.bed
-    echo "===================================================================================="
-    echo "===================================================================================="
-
-done < circs.bed
-
-# remove the trailing commas that appear on end of Exon Size, Exon Starts (col 11, 12)
-cat bed12/*.bed12.bed > master_bed12.bed.tmp
-
-awk 'BEGIN{FS=OFS="\t"} {gsub(/,$/,"",$11);gsub(/,$/,"",$12)} 1' master_bed12.bed.tmp > master_bed12.bed && rm master_bed12.bed.tmp
-
-echo "                   Thank you for using nf-core/circrna! - Barry                     "
-echo "===================================================================================="
-echo "===================================================================================="
+    else
+
+        echo "[nf-core/circrna]: $name returned no GTF overlaps."
+        echo "[nf-core/circrna]: Treating as an intronic circRNA"
+
+        gene_id="NA"
+        tx_id="NA"
+        type="ciRNA"
+        block_count=1
+        block_size=$(($stop-$start))
+        rgb="0,0,0"
+        block_start=0
+        awk -v OFS="\t" -v thick=$start -v rgb=$rgb -v count=$block_count -v start=$block_start -v size=$block_size \
+        '{print $0, thick, thick, rgb, count, size, start}' ${name}.bed > ${name}.bed12.bed
+fi
+# add type, geneid tx_id and count
+awk -v type="$type" -v gene="$gene_id" -v tx="$tx_id" -v count="$count" 'BEGIN{FS=OFS="\t"}{$5=count;$13=type;$14=gene;$15=tx}1' ${name}.bed12.bed > ${name}.bed12.bed_tmp
+awk -v OFS="\t" -v name=$name '{$4 = name; print}' ${name}.bed12.bed_tmp > ${name}.bed12.bed_tmp1
+
+rm ${name}.bed12.bed
+rm ${name}.bed12.bed_tmp
+mv ${name}.bed12.bed_tmp1 ${name}.bed12.bed
+echo "[nf-core/circrna]: cleaning up intermediate files"
+rm -f ${name}.gtf
+rm -f ${name}.genepred
+rm -f ${name}_predtobed.bed
+rm -f ${name}.bed
+
+cp ${name}.bed12.bed bed12/
+rm -rf ${name}.bed12.bed
diff --git a/conf/modules.config b/conf/modules.config
@@ -596,9 +596,30 @@ if (!params.skip_trimming) {
         ]
     }
 
+    withName: REMOVE_HEADER {
+        ext.args = "'{if (NR>1) {print}}'"
+        ext.suffix = "noheader.tsv"
+    }
+
+    withName: KEEP_FOUR_COLUMNS {
+        // Use tab as field separator
+        ext.args = "'BEGIN {FS=\"\t\"} {print \$1\"\t\"\$2\"\t\"\$3\"\t\"\$4}'"
+        ext.suffix = "four_columns.tsv"
+    }
+
+    withName: INTERSECT_ANNOTATION {
+        ext.args = "-wo"
+        ext.suffix = "intersect.bed"
+    }
+
+    withName: INV_INTERSECT_ANNOTATION {
+        ext.args = "-v"
+        ext.suffix = "inv_intersect.bed"
+    }
+
     withName: ANNOTATION {
         publishDir = [
-            path: { "${params.outdir}/circrna_discovery/${meta.tool}/${meta.id}" },
+            path: { "${params.outdir}/circrna_discovery/${meta.id}" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
@@ -607,7 +628,7 @@ if (!params.skip_trimming) {
     withName: FASTA {
         ext.when = { params.module.split(',').contains('circrna_discovery') }
         publishDir = [
-            path: { "${params.outdir}/circrna_discovery/${meta.tool}/${meta.id}" },
+            path: { "${params.outdir}/circrna_discovery/${meta.id}" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
             pattern: "*.fasta"
@@ -670,7 +691,7 @@ if (!params.skip_trimming) {
     withName: MIRNA_TARGETS {
         ext.when = { params.module.split(',').contains('mirna_prediction') }
         publishDir = [
-            path: { "${params.outdir}/mirna_prediction/${meta.tool}" },
+            path: { "${params.outdir}/mirna_prediction" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
             pattern: "*.txt"

diff --git a/docker/annotation/Dockerfile b/docker/annotation/Dockerfile
@@ -0,0 +1,3 @@
+FROM continuumio/miniconda3:23.9.0-0
+
+RUN conda install -y -c bioconda -c conda-forge ucsc-gtftogenepred=377 ucsc-genepredtobed=377 bedtools=2.27.0 parallel=20230922
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
@@ -0,0 +1,5 @@
+services:
+  annotation:
+    build:
+      context: annotation
+    image: bigdatainbiomedicine/circ-annotation
diff --git a/modules/local/annotation/annotation/main.nf b/modules/local/annotation/annotation/main.nf
@@ -0,0 +1,80 @@
+process ANNOTATION {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "bioconda::pandas=1.5.2"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/pandas:1.5.2' :
+        'quay.io/biocontainers/pandas:1.5.2' }"
+
+    input:
+    tuple val(meta), path(intersection)
+    tuple val(meta2), path(inv_intersection)
+
+    output:
+    tuple val(meta), path("annotation.bed"), emit: bed
+    tuple val(meta2), path("annotation.tsv"), emit: locstring
+
+    script:
+    """
+    #!/usr/bin/env python
+
+    import pandas as pd
+    import os
+
+    columns = {
+        0: 'chr',
+        1: 'start',
+        2: 'end',
+        3: 'strand',
+        12: 'attributes',
+        13: 'overlap'
+    }
+
+    attributes = ['gene_id', 'transcript_id']
+
+    df = pd.read_csv('$intersection', sep='\\t', header=None, usecols=columns.keys())
+    df = df.rename(columns=columns)
+
+    # Convert attributes to a dictionary
+    df['attributes'] = df['attributes'].apply(lambda row: dict([[value.strip(r'\"') for value in entry.strip().split(' ')] for entry in row.split(';') if entry]))
+    # Keep only the attributes we want
+    df['attributes'] = df['attributes'].apply(lambda row: {key: row[key] for key in attributes})
+    # Convert attributes to columns
+    df = pd.concat([df.drop(['attributes'], axis=1), df['attributes'].apply(pd.Series)], axis=1)
+
+    df = df.groupby(['chr', 'start', 'end', 'strand']).aggregate(lambda x: set(x))
+
+    df['length'] = df.index.map(lambda row: row[2] - row[1])
+    # Check if there is an overlap entry with the same length as the feature
+    df['perfect'] = df.apply(lambda row: row['length'] in row['overlap'], axis=1)
+    # Drop the overlap and length columns
+    df = df.drop(['overlap', 'length'], axis=1)
+    # Kee only unique gene_ids
+    df['gene_id'] = df['gene_id'].apply(lambda row: ','.join(row))
+    df['transcript_id'] = df['transcript_id'].apply(lambda row: ','.join(row))
+
+    # If 'perfect': circRNA, else EIciRNA
+    df['type'] = df['perfect'].apply(lambda perfect: 'circRNA' if perfect else 'EIciRNA')
+    # Drop the perfect column
+    df = df.drop(['perfect'], axis=1)
+
+    if not os.stat('$inv_intersection').st_size == 0:
+        df_nooverlap = pd.read_csv('$inv_intersection', sep='\\t', header=None)
+        df_nooverlap.columns = ['chr', 'start', 'end', 'strand']
+        df_nooverlap['type'] = 'ciRNA'
+        df_nooverlap['gene_id'] = 'NA'
+
+        # Set chr, start, end and strand as index
+        df_nooverlap = df_nooverlap.set_index(['chr', 'start', 'end', 'strand'])
+
+        df = pd.concat([df, df_nooverlap])
+
+    df = df.sort_index()
+
+    df.to_csv('annotation.bed', sep='\\t', index=True, header=False)
+
+    df.index = df.index.map(lambda row: "{}:{}-{}:{}".format(*row))
+    df.to_csv('annotation.tsv', sep='\\t', index=True, header=True)
+    """
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		FROM continuumio/miniconda3:23.9.0-0

		RUN conda install -y -c bioconda -c conda-forge ucsc-gtftogenepred=377 ucsc-genepredtobed=377 bedtools=2.27.0 parallel=20230922