diff --git a/definitions/pipelines/rnaseq.cwl b/definitions/pipelines/rnaseq.cwl
index c42f23a9..5961d59c 100644
--- a/definitions/pipelines/rnaseq.cwl
+++ b/definitions/pipelines/rnaseq.cwl
@@ -150,8 +150,6 @@ steps:
         run: ../tools/mark_duplicates_and_sort.cwl
         in:
             bam: index_bam/indexed_bam
-            input_sort_order:
-                default: "coordinate"
         out:
             [sorted_bam, metrics_file]
     stringtie:
diff --git a/definitions/pipelines/rnaseq_star_fusion.cwl b/definitions/pipelines/rnaseq_star_fusion.cwl
index a9170cb5..789e6806 100644
--- a/definitions/pipelines/rnaseq_star_fusion.cwl
+++ b/definitions/pipelines/rnaseq_star_fusion.cwl
@@ -238,8 +238,6 @@ steps:
         run: ../tools/mark_duplicates_and_sort.cwl
         in:
             bam: sort_bam/sorted_bam
-            input_sort_order:
-                default: "coordinate"
         out:
             [sorted_bam, metrics_file]
     index_bam:
diff --git a/definitions/pipelines/rnaseq_star_fusion_with_xenosplit.cwl b/definitions/pipelines/rnaseq_star_fusion_with_xenosplit.cwl
index 1537d435..9d47f552 100644
--- a/definitions/pipelines/rnaseq_star_fusion_with_xenosplit.cwl
+++ b/definitions/pipelines/rnaseq_star_fusion_with_xenosplit.cwl
@@ -257,8 +257,6 @@ steps:
         run: ../tools/mark_duplicates_and_sort.cwl
         in:
             bam: sort_bam/sorted_bam
-            input_sort_order:
-                default: "coordinate"
         out:
             [sorted_bam, metrics_file]
     index_bam:
diff --git a/definitions/tools/generate_fda_tables.cwl b/definitions/tools/generate_fda_tables.cwl
index 93ca2421..d81dc714 100644
--- a/definitions/tools/generate_fda_tables.cwl
+++ b/definitions/tools/generate_fda_tables.cwl
@@ -4,7 +4,7 @@ class: CommandLineTool
 label: "Script to create FDA-requested summary tables"
 requirements:
     - class: DockerRequirement
-      dockerPull: "python:3.7.4-slim-buster"
+      dockerPull: "python:3.10.8-slim-buster"
     - class: ResourceRequirement
       ramMin: 8000
     - class: InitialWorkDirRequirement
@@ -232,9 +232,35 @@ requirements:
 
             def parse_duplication_metrics(duplication_metrics):
+                """Return the percent duplication computed from a duplicate-marking log.
+
+                The log must contain 'sorted N end pairs', 'and N single ends' and
+                'found N duplicates'. The result is
+                {'PERCENT_DUPLICATION': str(duplicates / (2*pairs + singles) * 100)}.
+                Raises ValueError if a count is missing or no reads were counted.
+                """
                 with open(duplication_metrics) as f:
-                    raw_chunk = f.read().split('\n\n')[1]
-                    pct_dup = raw_chunk.splitlines()[2].split('\t')[8]
-                    return {'PERCENT_DUPLICATION': pct_dup}
+                    pairs = None
+                    singles = None
+                    duplicates = None
+                    lines = f.read().splitlines()
+                    for line in lines:
+                        if match_pairs := re.search(r'sorted (\d+) end pairs', line):
+                            pairs = match_pairs.group(1)
+                        elif match_singles := re.search(r'and (\d+) single ends', line):
+                            singles = match_singles.group(1)
+                        elif match_duplicates := re.search(r'found (\d+) duplicates', line):
+                            duplicates = match_duplicates.group(1)
+                    if pairs is None:
+                        raise ValueError('Failed to parse number of end pairs')
+                    if singles is None:
+                        raise ValueError('Failed to parse number of single ends')
+                    if duplicates is None:
+                        raise ValueError('Failed to parse number of duplicates')
+
+                    total_reads = 2.0*float(pairs) + float(singles)
+                    if total_reads == 0:
+                        raise ValueError('No reads found; cannot compute percent duplication')
+                    return {'PERCENT_DUPLICATION': str(float(duplicates)/total_reads*100.0)}
 
             def parse_insert_size_metrics(insert_size_metrics):
                 with open(insert_size_metrics) as f:
diff --git a/definitions/tools/mark_duplicates_and_sort.cwl b/definitions/tools/mark_duplicates_and_sort.cwl
index 79097442..13a0f85d 100644
--- a/definitions/tools/mark_duplicates_and_sort.cwl
+++ b/definitions/tools/mark_duplicates_and_sort.cwl
@@ -7,10 +7,10 @@ label: "Mark duplicates and Sort"
 baseCommand: ["/bin/bash", "markduplicates_helper.sh"]
 requirements:
     - class: ResourceRequirement
-      coresMin: 8
+      coresMin: 16
       ramMin: 40000
     - class: DockerRequirement
-      dockerPull: "mgibio/mark_duplicates-cwl:1.0.1"
+      dockerPull: "quay.io/biocontainers/sambamba:0.8.2--h98b6b92_2"
     - class: InitialWorkDirRequirement
       listing:
       - entryname: 'markduplicates_helper.sh'
@@ -18,13 +18,15 @@ requirements:
             set -o pipefail
             set -o errexit
 
-            declare MD_BARCODE_TAG
-            if [ ! -z "$6" ]; then
-              MD_BARCODE_TAG="BARCODE_TAG=$6"
-              /usr/bin/java -Xmx16g -jar /opt/picard/picard.jar MarkDuplicates I=$1 O=/dev/stdout ASSUME_SORT_ORDER=$5 METRICS_FILE=$4 QUIET=true COMPRESSION_LEVEL=0 VALIDATION_STRINGENCY=LENIENT "$MD_BARCODE_TAG" | /usr/bin/sambamba sort -t $2 -m 18G -o $3 /dev/stdin
-            else
-              /usr/bin/java -Xmx16g -jar /opt/picard/picard.jar MarkDuplicates I=$1 O=/dev/stdout ASSUME_SORT_ORDER=$5 METRICS_FILE=$4 QUIET=true COMPRESSION_LEVEL=0 VALIDATION_STRINGENCY=LENIENT | /usr/bin/sambamba sort -t $2 -m 18G -o $3 /dev/stdin
-            fi
+            # Both stages of the pipe run concurrently, so each gets half of the
+            # allotted cores (never fewer than one).
+            # NOTE(review): confirm perl is available in the sambamba image.
+            CORES="$2"
+            CORES_PER_JOB=`perl -E 'my $x = int($ARGV[0]/2); say($x < 1? 1 : $x)' "$CORES"`
+
+            # The markdup log printed on stderr is saved as the metrics file.
+            sambamba markdup -l 0 -t "$CORES_PER_JOB" "$1" /dev/stdout 2> "$4" \
+              | sambamba sort -t "$CORES_PER_JOB" -m 16G -o "$3" /dev/stdin
 arguments:
     - position: 2
       valueFrom: "$(runtime.cores)"
@@ -35,11 +37,6 @@ inputs:
         type: File
         inputBinding:
             position: 1
-    input_sort_order:
-        type: string
-        default: "queryname"
-        inputBinding:
-            position: 5
     output_name:
         type: string?
         default: 'MarkedSorted.bam'