From 381d47347203ebd92881c1a8d1621aa99784810d Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Wed, 28 Feb 2018 14:47:34 +0100 Subject: [PATCH 01/45] Cleaner pool iterable --- exfi/polish.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/exfi/polish.py b/exfi/polish.py index 0e0f3f5..d708e43 100644 --- a/exfi/polish.py +++ b/exfi/polish.py @@ -170,12 +170,14 @@ def polish_splice_graph_dict( # pool.join() # Run + sg_fasta_pairs = ( + (splice_graph, {transcript_id: fasta_dict[transcript_id]}) + for transcript_id, splice_graph in splice_graph_dict.items() + ) + results = pool.starmap( func=polish_splice_graph, - iterable=( - (splice_graph_value, {splice_graph_key: fasta_dict[splice_graph_key]}) - for splice_graph_key, splice_graph_value in splice_graph_dict.items() - ), + iterable=sg_fasta_pairs, chunksize=1000 ) pool.close() From 5806927b14bd1221c5fdfe73166b9b7462928b22 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Wed, 28 Feb 2018 14:55:20 +0100 Subject: [PATCH 02/45] Added pipeline scripts for fast inspection --- scripts/pipeline_chr25.sh | 97 +++++++++++++++++++++++++++++++++++ scripts/pipeline_chr25_big.sh | 97 +++++++++++++++++++++++++++++++++++ scripts/pipeline_test.sh | 71 +++++++++++++++++++++++++ 3 files changed, 265 insertions(+) create mode 100644 scripts/pipeline_chr25.sh create mode 100644 scripts/pipeline_chr25_big.sh create mode 100644 scripts/pipeline_test.sh diff --git a/scripts/pipeline_chr25.sh b/scripts/pipeline_chr25.sh new file mode 100644 index 0000000..73ded4b --- /dev/null +++ b/scripts/pipeline_chr25.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash +set -euxo pipefail + +mkdir -p results/ + +k=31 +m=4G +l=1 + +if [ ! -f results/drer25sim_k${k}_m${m}_l${l}.bloom ]; then +build_baited_bloom_filter \ + --input-fasta data/Danio_rerio.GRCz10.cdna.25.fa \ + --kmer $k \ + --bloom-size $m \ + --levels $l \ + --threads 4 \ + --output-bloom results/drer25sim_k${k}_m${m}_l${l}.bloom \ + data/drer25.1_1.fq.gz \ + data/drer25.1_2.fq.gz \ + data/drer25.2_1.fq.gz \ + data/drer25.2_2.fq.gz +fi + +if [ ! -f results/drer25real_k${k}_m${m}_l1.bloom ]; then +build_baited_bloom_filter \ + --input-fasta data/Danio_rerio.GRCz10.cdna.25.fa \ + --kmer $k \ + --bloom-size $m \ + --levels 1 \ + --threads 4 \ + --output-bloom results/drer25real_k${k}_m${m}_l1.bloom \ + data/Danio_rerio.GRCz10.dna.25.fa.gz +fi + +build_splice_graph \ + --input-fasta data/Danio_rerio.GRCz10.cdna.all.fa \ + --input-bloom results/drer25sim_k${k}_m${m}_l${l}.bloom \ + --kmer $k \ + --max-fp-bases 3 \ + --output-gfa results/drer25sim.gfa \ + --verbose \ + --threads 4 + +build_splice_graph \ + --input-fasta data/Danio_rerio.GRCz10.cdna.all.fa \ + --input-bloom results/drer25sim_k${k}_m${m}_l${l}.bloom \ + --kmer $k \ + --max-fp-bases 3 \ + --correct \ + --output-gfa results/drer25sim_correct.gfa \ + --verbose \ + --threads 4 + +build_splice_graph \ + --input-fasta data/Danio_rerio.GRCz10.cdna.all.fa \ + --input-bloom results/drer25sim_k${k}_m${m}_l${l}.bloom \ + --kmer $k \ + --max-fp-bases 3 \ + --correct \ + --polish \ + --output-gfa results/drer25sim_correct_polish.gfa \ + --verbose \ + --threads 4 + +build_splice_graph \ + --input-fasta data/Danio_rerio.GRCz10.cdna.all.fa \ + --input-bloom results/drer25sim_k${k}_m${m}_l${l}.bloom \ + --kmer $k \ + --max-fp-bases 3 \ + --correct \ + --polish \ + --collapse \ + --output-gfa results/drer25sim_correct_polish_collapse.gfa \ + --verbose \ + --threads 4 + +build_splice_graph \ + --input-fasta data/Danio_rerio.GRCz10.cdna.all.fa \ + --input-bloom results/drer25real_k${k}_m${m}_l1.bloom \ + --kmer $k \ + --max-fp-bases 3 \ + --correct \ + --output-gfa results/drer25real.gfa \ + --verbose \ + --threads 4 + +gfa1_to_exons \ + --input-gfa results/drer25sim.gfa \ + --output-fasta results/drer25sim_exons.fa \ + --soft-mask-overlaps + +gfa1_to_exons \ + --input-gfa results/drer25real.gfa \ + --output-fasta results/drer25real_exons.fa \ + --soft-mask-overlaps + +bash src/pr_chr25.sh diff --git a/scripts/pipeline_chr25_big.sh b/scripts/pipeline_chr25_big.sh new file mode 100644 index 0000000..320041c --- /dev/null +++ b/scripts/pipeline_chr25_big.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash +set -euxo pipefail + +mkdir -p results/ + +k=31 +m=4G +l=1 + +if [ ! -f results/drer25sim_k${k}_m${m}_l${l}.bloom ]; then +build_baited_bloom_filter \ + --input-fasta data/transcript.fa \ + --kmer $k \ + --bloom-size $m \ + --levels $l \ + --threads 4 \ + --output-bloom results/drer25sim_k${k}_m${m}_l${l}.bloom \ + data/drer25.1_1.fq.gz \ + data/drer25.1_2.fq.gz \ + data/drer25.2_1.fq.gz \ + data/drer25.2_2.fq.gz +fi + +if [ ! -f results/drer25real_k${k}_m${m}_l1.bloom ]; then +build_baited_bloom_filter \ + --input-fasta data/transcript.fa \ + --kmer $k \ + --bloom-size $m \ + --levels 1 \ + --threads 4 \ + --output-bloom results/drer25real_k${k}_m${m}_l1.bloom \ + data/Danio_rerio.GRCz10.dna.25.fa.gz +fi + +build_splice_graph \ + --input-fasta data/Danio_rerio.GRCz10.cdna.25.fa \ + --input-bloom results/drer25sim_k${k}_m${m}_l${l}.bloom \ + --kmer $k \ + --max-fp-bases 3 \ + --output-gfa results/drer25sim.gfa \ + --verbose \ + --threads 4 + +build_splice_graph \ + --input-fasta data/Danio_rerio.GRCz10.cdna.25.fa \ + --input-bloom results/drer25sim_k${k}_m${m}_l${l}.bloom \ + --kmer $k \ + --max-fp-bases 3 \ + --correct \ + --output-gfa results/drer25sim_correct.gfa \ + --verbose \ + --threads 4 + +build_splice_graph \ + --input-fasta data/Danio_rerio.GRCz10.cdna.25.fa \ + --input-bloom results/drer25sim_k${k}_m${m}_l${l}.bloom \ + --kmer $k \ + --max-fp-bases 3 \ + --correct \ + --polish \ + --output-gfa results/drer25sim_correct_polish.gfa \ + --verbose \ + --threads 4 + +build_splice_graph \ + --input-fasta data/Danio_rerio.GRCz10.cdna.25.fa \ + --input-bloom results/drer25sim_k${k}_m${m}_l${l}.bloom \ + --kmer $k \ + --max-fp-bases 3 \ + --correct \ + --polish \ + --collapse \ + --output-gfa results/drer25sim_correct_polish_collapse.gfa \ + --verbose \ + --threads 4 + +build_splice_graph \ + --input-fasta data/Danio_rerio.GRCz10.cdna.25.fa \ + --input-bloom results/drer25real_k${k}_m${m}_l1.bloom \ + --kmer $k \ + --max-fp-bases 3 \ + --correct \ + --output-gfa results/drer25real.gfa \ + --verbose \ + --threads 4 + +gfa1_to_exons \ + --input-gfa results/drer25sim.gfa \ + --output-fasta results/drer25sim_exons.fa \ + --soft-mask-overlaps + +gfa1_to_exons \ + --input-gfa results/drer25real.gfa \ + --output-fasta results/drer25real_exons.fa \ + --soft-mask-overlaps + +bash src/pr_chr25.sh diff --git a/scripts/pipeline_test.sh b/scripts/pipeline_test.sh new file mode 100644 index 0000000..d955b4f --- /dev/null +++ b/scripts/pipeline_test.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +set -euxo pipefail + + +mkdir -p results/ + +if [ ! -f results/test_k27_m500M_l1.bloom ]; then + build_baited_bloom_filter \ + --input-fasta data/transcript.fa \ + --kmer-size 27 \ + --bloom-size 500M \ + --levels 1 \ + --threads 4 \ + --output-bloom results/test_k27_m500M_l1.bloom \ + data/genome.fa.gz +fi + +build_splice_graph \ + --input-fasta data/transcript.fa \ + --input-bloom results/test_k27_m500M_l1.bloom \ + --kmer-size 27 \ + --max-fp-bases 5 \ + --output-gfa results/test.gfa \ + --verbose \ + --threads 4 + +build_splice_graph \ + --input-fasta data/transcript.fa \ + --input-bloom results/test_k27_m500M_l1.bloom \ + --kmer-size 27 \ + --max-fp-bases 5 \ + --output-gfa results/test_correct.gfa \ + --verbose \ + --threads 4 \ + --correct + +build_splice_graph \ + --input-fasta data/transcript.fa \ + --input-bloom results/test_k27_m500M_l1.bloom \ + --kmer-size 27 \ + --max-fp-bases 5 \ + --output-gfa results/test_correct_polish.gfa \ + --verbose \ + --threads 4 \ + --correct \ + --polish + +build_splice_graph \ + --input-fasta data/transcript.fa \ + --input-bloom results/test_k27_m500M_l1.bloom \ + --kmer-size 27 \ + --max-fp-bases 5 \ + --output-gfa results/test_correct_polish_collapse.gfa \ + --verbose \ + --threads 4 \ + --correct \ + --polish \ + --collapse + + +gfa1_to_exons \ + --input-gfa results/test.gfa \ + --output-fasta results/test_exons.fa \ + --soft-mask-overlaps \ + --verbose + +gfa1_to_gapped_transcripts \ + --input-gfa results/test.gfa \ + --output-fasta results/test_gapped.fa \ + --hard-mask-overlaps \ + --verbose From 642614e65079745af635790a32790fddee9f9701 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Wed, 28 Feb 2018 17:06:11 +0100 Subject: [PATCH 03/45] Bugfix: SimpleFastaParser uses the entire header as identifier! --- exfi/io/fasta_to_dict.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/exfi/io/fasta_to_dict.py b/exfi/io/fasta_to_dict.py index eb6f719..727b147 100644 --- a/exfi/io/fasta_to_dict.py +++ b/exfi/io/fasta_to_dict.py @@ -14,4 +14,7 @@ def fasta_to_dict(filename: str) -> FastaDict: """ with open(filename, "r") as handle: - return FastaDict(SimpleFastaParser(handle)) + return FastaDict( + (identifier.split(" ")[0], sequence) + for identifier, sequence in SimpleFastaParser(handle) + ) From 6c40440a858f924fec1791d643b9ba04138c7cbd Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Wed, 28 Feb 2018 17:09:15 +0100 Subject: [PATCH 04/45] Version++ --- exfi/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exfi/__init__.py b/exfi/__init__.py index abe2d27..fa9b993 100644 --- a/exfi/__init__.py +++ b/exfi/__init__.py @@ -3,4 +3,4 @@ filters. """ -__version__ = '1.4.6' +__version__ = '1.4.7' From 33f2c718a9c631abb23403d44e6e271f63ab1d4b Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Thu, 1 Mar 2018 10:31:01 +0100 Subject: [PATCH 05/45] Even better handling of fasta files. Expanded tests so it doesn't happen again --- exfi/io/fasta_to_dict.py | 2 +- tests/io/spaced.fa | 51 +++++++++++++++++++++++++++++ tests/io/tabbed.fa | 51 +++++++++++++++++++++++++++++ tests/test_io/test_fasta_to_dict.py | 13 ++++++++ 4 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 tests/io/spaced.fa create mode 100644 tests/io/tabbed.fa diff --git a/exfi/io/fasta_to_dict.py b/exfi/io/fasta_to_dict.py index 727b147..3854218 100644 --- a/exfi/io/fasta_to_dict.py +++ b/exfi/io/fasta_to_dict.py @@ -15,6 +15,6 @@ def fasta_to_dict(filename: str) -> FastaDict: """ with open(filename, "r") as handle: return FastaDict( - (identifier.split(" ")[0], sequence) + (identifier.split()[0], sequence) for identifier, sequence in SimpleFastaParser(handle) ) diff --git a/tests/io/spaced.fa b/tests/io/spaced.fa new file mode 100644 index 0000000..e81e2a8 --- /dev/null +++ b/tests/io/spaced.fa @@ -0,0 +1,51 @@ +>ENSDART00000161035.1:0-326 description description +TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTA +CATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACA +GCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCT +CTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCG +TCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGG +ATGTGGAAATTCCTCCGCCACGAGCA +>ENSDART00000161035.1:397-472 description description +AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGT +CAGTCCAAATCAACA +>ENSDART00000161035.1:477-523 description description +AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA +>ENSDART00000165342.1:5-127 description description +TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAA +GGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGA +AG +>ENSDART00000165342.1:125-304 description description +AGGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGG +CAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACC +CCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA +>ENSDART00000165342.1:317-460 description description +GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGA +GGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGG +ACAAGGAGACAAGAAGTTACCAG +>ENSDART00000165342.1:459-592 description description +GTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGC +TCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTT +CTGGGGTTCTTAA +>ENSDART00000165342.1:591-650 description description +AGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTGCCCT +>ENSDART00000165342.1:645-746 description description +GCCCTTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACC +ATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT +>ENSDART00000165342.1:746-851 description description +GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGC +CTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA +>ENSDART00000165342.1:854-886 description description +TGCAGCCAAACAATGCAACTGTGACAGCAGCA +>ENSDART00000165342.1:899-953 description description +TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA +>ENSDART00000165342.1:974-1097 description description +CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGA +GTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGC +TGA +>ENSDART00000165342.1:1098-1175 description description +TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTG +ACGCTCACATATCCTGA +>ENSDART00000165342.1:1176-1324 description description +CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGAC +CCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCT +GCCCTGCAACTGGCAAAACACCACAGCG diff --git a/tests/io/tabbed.fa b/tests/io/tabbed.fa new file mode 100644 index 0000000..d6bce0d --- /dev/null +++ b/tests/io/tabbed.fa @@ -0,0 +1,51 @@ +>ENSDART00000161035.1:0-326 description description +TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTA +CATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACA +GCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCT +CTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCG +TCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGG +ATGTGGAAATTCCTCCGCCACGAGCA +>ENSDART00000161035.1:397-472 description description +AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGT +CAGTCCAAATCAACA +>ENSDART00000161035.1:477-523 description description +AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA +>ENSDART00000165342.1:5-127 description +TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAA +GGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGA +AG +>ENSDART00000165342.1:125-304 description description +AGGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGG +CAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACC +CCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA +>ENSDART00000165342.1:317-460 description description +GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGA +GGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGG +ACAAGGAGACAAGAAGTTACCAG +>ENSDART00000165342.1:459-592 description description +GTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGC +TCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTT +CTGGGGTTCTTAA +>ENSDART00000165342.1:591-650 description description +AGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTGCCCT +>ENSDART00000165342.1:645-746 description description +GCCCTTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACC +ATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT +>ENSDART00000165342.1:746-851 description description +GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGC +CTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA +>ENSDART00000165342.1:854-886 description description +TGCAGCCAAACAATGCAACTGTGACAGCAGCA +>ENSDART00000165342.1:899-953 description description +TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA +>ENSDART00000165342.1:974-1097 description description +CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGA +GTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGC +TGA +>ENSDART00000165342.1:1098-1175 description description +TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTG +ACGCTCACATATCCTGA +>ENSDART00000165342.1:1176-1324 description description +CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGAC +CCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCT +GCCCTGCAACTGGCAAAACACCACAGCG diff --git a/tests/test_io/test_fasta_to_dict.py b/tests/test_io/test_fasta_to_dict.py index 8510733..fdd4c26 100644 --- a/tests/test_io/test_fasta_to_dict.py +++ b/tests/test_io/test_fasta_to_dict.py @@ -17,6 +17,8 @@ EXONS_EMPTY_FN = "tests/io/exons_empty.fa" EXONS_SIMPLE_FN = "tests/io/exons_simple.fa" EXONS_COMPLEX_FN = "tests/io/exons_complex.fa" +EXONS_SPACED_FN = "tests/io/spaced.fa" +EXONS_TABBED_FN = "tests/io/tabbed.fa" EXONS_EMPTY_DICT = FastaDict() EXONS_SIMPLE_DICT = FastaDict({ @@ -92,6 +94,17 @@ def test_complex(self): expected = EXONS_COMPLEX_DICT self.assertEqualDict(actual, expected) + def test_spaced(self): + """exfi.io.fasta_to_dict: process a fasta with spaces in the header""" + actual = fasta_to_dict(EXONS_SPACED_FN) + expected = EXONS_COMPLEX_DICT + self.assertEqualDict(actual, expected) + + def test_tabbed(self): + """exfi.io.fasta_to_dict: process a fasta with tabs in the header""" + actual = fasta_to_dict(EXONS_TABBED_FN) + expected = EXONS_COMPLEX_DICT + self.assertEqualDict(actual, expected) if __name__ == '__main__': main() From f481c5adefc8f45f05e0bb4aaa099c049429a7f0 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Thu, 1 Mar 2018 10:34:53 +0100 Subject: [PATCH 06/45] Version++ --- exfi/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exfi/__init__.py b/exfi/__init__.py index fa9b993..52875c8 100644 --- a/exfi/__init__.py +++ b/exfi/__init__.py @@ -3,4 +3,4 @@ filters. """ -__version__ = '1.4.7' +__version__ = '1.4.8' From b44d62cf781da54811567f62e3201435aa976f86 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Fri, 2 Mar 2018 18:09:30 +0100 Subject: [PATCH 07/45] Threading (#43) * pathos.multiprocessing -> pathos.ThreadingPool * typo. Wrong fasta file * Deleted comments --- exfi/build_splice_graph_dict.py | 10 +++++---- exfi/correct.py | 15 +++++++------ exfi/polish.py | 37 ++++++++------------------------- scripts/pipeline_chr25.sh | 10 ++++----- 4 files changed, 27 insertions(+), 45 deletions(-) diff --git a/exfi/build_splice_graph_dict.py b/exfi/build_splice_graph_dict.py index b29cd42..acda6f8 100644 --- a/exfi/build_splice_graph_dict.py +++ b/exfi/build_splice_graph_dict.py @@ -9,7 +9,7 @@ import networkx as nx import pandas as pd -import pathos.multiprocessing as mp +from pathos.threading import ThreadPool from exfi.classes import Node2Coordinates, Edge2Overlap, Coordinate, SpliceGraph, SpliceGraphDict, \ Path2Nodes @@ -191,16 +191,18 @@ def build_splice_graph_dict( bed6df_dict = bed3_records_to_bed6df_dict(bed3records) # Initialize pool of workers - pool = mp.Pool(args["threads"]) + pool = ThreadPool(args["threads"]) # Build graphs in parallel and merge results splice_graphs = pool.map( - func=build_splice_graph, - iterable=bed6df_dict.values(), + build_splice_graph, + bed6df_dict.values(), chunksize=1000 ) pool.close() pool.join() + pool.restart() + splice_graph_dict = SpliceGraphDict(zip(bed6df_dict.keys(), splice_graphs)) return splice_graph_dict diff --git a/exfi/correct.py b/exfi/correct.py index 572c835..6855d78 100644 --- a/exfi/correct.py +++ b/exfi/correct.py @@ -19,7 +19,7 @@ import networkx as nx -import pathos.multiprocessing as mp +from pathos.threading import ThreadPool from Bio import \ SeqIO, \ @@ -412,20 +412,19 @@ def correct_splice_graph_dict(splice_graph_dict: SpliceGraphDict, args: dict) -> filled_edges_by_transcript[transcript] = set() # Initialize pool of workers - pool = mp.Pool(args["threads"]) + pool = ThreadPool(args["threads"]) # Process each graph in parallel logging.info("\tCorrecting each splice graph") - corrected_splice_graphs = pool.starmap( - func=_sculpt_graph, - iterable=zip( - splice_graph_dict.values(), - filled_edges_by_transcript.values() - ), + corrected_splice_graphs = pool.map( + _sculpt_graph, + splice_graph_dict.values(), + filled_edges_by_transcript.values(), chunksize=1000 ) pool.close() pool.join() + pool.restart() splice_graph_dict = SpliceGraphDict( zip(splice_graph_dict.keys(), corrected_splice_graphs) ) diff --git a/exfi/polish.py b/exfi/polish.py index d708e43..c39f408 100644 --- a/exfi/polish.py +++ b/exfi/polish.py @@ -6,12 +6,15 @@ import networkx as nx +from pathos.threading import ThreadPool + from exfi.io import \ _coordinate_to_str from exfi.classes import Coordinate, SpliceGraph, SpliceGraphDict, FastaDict + def trim_end(coordinate: Coordinate, bases: int) -> Coordinate: """Trim bases to the end of the coordinate @@ -148,40 +151,18 @@ def polish_splice_graph_dict( args must at least be {"threads": 1} """ - import pathos.multiprocessing as mp - # Initialize pool of workers - pool = mp.Pool(args["threads"]) - - # def polish_wrapper(splice_graph: SpliceGraph) -> SpliceGraph: - # """Export fasta_dict to function. - # - # :param nx.DiGraph splice_graph: splice_graph to polish. - # """ - # return polish_splice_graph(splice_graph, fasta_dict) - - # # Run - # results = pool.starmap( - # func=polish_wrapper, - # iterable=splice_graph_dict.values(), - # chunksize=1000 - # ) - # pool.close() - # pool.join() - - # Run - sg_fasta_pairs = ( - (splice_graph, {transcript_id: fasta_dict[transcript_id]}) - for transcript_id, splice_graph in splice_graph_dict.items() - ) + pool = ThreadPool(args["threads"]) - results = pool.starmap( - func=polish_splice_graph, - iterable=sg_fasta_pairs, + results = pool.map( + polish_splice_graph, + splice_graph_dict.values(), + ({transcript_id: fasta_dict[transcript_id]} for transcript_id in splice_graph_dict.keys()), chunksize=1000 ) pool.close() pool.join() + pool.restart() # Add results to splice_graph_dict for i, transcript in enumerate(splice_graph_dict.keys()): diff --git a/scripts/pipeline_chr25.sh b/scripts/pipeline_chr25.sh index 73ded4b..8db1095 100644 --- a/scripts/pipeline_chr25.sh +++ b/scripts/pipeline_chr25.sh @@ -33,7 +33,7 @@ build_baited_bloom_filter \ fi build_splice_graph \ - --input-fasta data/Danio_rerio.GRCz10.cdna.all.fa \ + --input-fasta data/Danio_rerio.GRCz10.cdna.25.fa \ --input-bloom results/drer25sim_k${k}_m${m}_l${l}.bloom \ --kmer $k \ --max-fp-bases 3 \ @@ -42,7 +42,7 @@ build_splice_graph \ --threads 4 build_splice_graph \ - --input-fasta data/Danio_rerio.GRCz10.cdna.all.fa \ + --input-fasta data/Danio_rerio.GRCz10.cdna.25.fa \ --input-bloom results/drer25sim_k${k}_m${m}_l${l}.bloom \ --kmer $k \ --max-fp-bases 3 \ @@ -52,7 +52,7 @@ build_splice_graph \ --threads 4 build_splice_graph \ - --input-fasta data/Danio_rerio.GRCz10.cdna.all.fa \ + --input-fasta data/Danio_rerio.GRCz10.cdna.25.fa \ --input-bloom results/drer25sim_k${k}_m${m}_l${l}.bloom \ --kmer $k \ --max-fp-bases 3 \ @@ -63,7 +63,7 @@ build_splice_graph \ --threads 4 build_splice_graph \ - --input-fasta data/Danio_rerio.GRCz10.cdna.all.fa \ + --input-fasta data/Danio_rerio.GRCz10.cdna.25.fa \ --input-bloom results/drer25sim_k${k}_m${m}_l${l}.bloom \ --kmer $k \ --max-fp-bases 3 \ @@ -75,7 +75,7 @@ build_splice_graph \ --threads 4 build_splice_graph \ - --input-fasta data/Danio_rerio.GRCz10.cdna.all.fa \ + --input-fasta data/Danio_rerio.GRCz10.cdna.25.fa \ --input-bloom results/drer25real_k${k}_m${m}_l1.bloom \ --kmer $k \ --max-fp-bases 3 \ From a63f34734c5d03d1cc00c63dbb3a8c201516e2c7 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Fri, 2 Mar 2018 18:19:42 +0100 Subject: [PATCH 08/45] Added done message --- bin/build_baited_bloom_filter | 2 ++ bin/build_splice_graph | 2 ++ bin/gfa1_to_exons | 2 ++ bin/gfa1_to_gapped_transcripts | 2 ++ 4 files changed, 8 insertions(+) diff --git a/bin/build_baited_bloom_filter b/bin/build_baited_bloom_filter index 0558b01..dadb5cd 100755 --- a/bin/build_baited_bloom_filter +++ b/bin/build_baited_bloom_filter @@ -141,3 +141,5 @@ if __name__ == '__main__': # Run the program build_baited_bloom_filter(args) + + logging.info("Done!") diff --git a/bin/build_splice_graph b/bin/build_splice_graph index ae3d016..efe9da1 100755 --- a/bin/build_splice_graph +++ b/bin/build_splice_graph @@ -211,3 +211,5 @@ if __name__ == "__main__": transcriptome_dict=transcriptome_dict, filename=args["gfa1"] ) + + logging.info("Done!") diff --git a/bin/gfa1_to_exons b/bin/gfa1_to_exons index 9894ba7..4b66f2a 100755 --- a/bin/gfa1_to_exons +++ b/bin/gfa1_to_exons @@ -103,3 +103,5 @@ if __name__ == "__main__": fasta_out_fn=args["fasta"], masking=masking ) + + logging.info("Done!") diff --git a/bin/gfa1_to_gapped_transcripts b/bin/gfa1_to_gapped_transcripts index 1405a65..aeba470 100755 --- a/bin/gfa1_to_gapped_transcripts +++ b/bin/gfa1_to_gapped_transcripts @@ -117,3 +117,5 @@ if __name__ == "__main__": number_of_ns=args["number_of_ns"], masking=masking ) + + logging.info("Done!") From d8920862a7e31ba32c3039dcb31fa09117940b8d Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Fri, 2 Mar 2018 18:21:21 +0100 Subject: [PATCH 09/45] version++ --- exfi/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exfi/__init__.py b/exfi/__init__.py index 769b15e..1fa4c31 100644 --- a/exfi/__init__.py +++ b/exfi/__init__.py @@ -3,4 +3,4 @@ filters. """ -__version__ = '1.4.8' \ No newline at end of file +__version__ = '1.4.9' From 7d198e135347d1470fd6367a806fa49a06a9be55 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Mon, 5 Mar 2018 17:13:16 +0100 Subject: [PATCH 10/45] Going back from pathos.ThreadPool to pathos.multiprocessing --- exfi/build_splice_graph_dict.py | 10 ++++----- exfi/correct.py | 15 +++++++------- exfi/polish.py | 36 +++++++++++++++++++++++++-------- 3 files changed, 40 insertions(+), 21 deletions(-) diff --git a/exfi/build_splice_graph_dict.py b/exfi/build_splice_graph_dict.py index acda6f8..b29cd42 100644 --- a/exfi/build_splice_graph_dict.py +++ b/exfi/build_splice_graph_dict.py @@ -9,7 +9,7 @@ import networkx as nx import pandas as pd -from pathos.threading import ThreadPool +import pathos.multiprocessing as mp from exfi.classes import Node2Coordinates, Edge2Overlap, Coordinate, SpliceGraph, SpliceGraphDict, \ Path2Nodes @@ -191,18 +191,16 @@ def build_splice_graph_dict( bed6df_dict = bed3_records_to_bed6df_dict(bed3records) # Initialize pool of workers - pool = ThreadPool(args["threads"]) + pool = mp.Pool(args["threads"]) # Build graphs in parallel and merge results splice_graphs = pool.map( - build_splice_graph, - bed6df_dict.values(), + func=build_splice_graph, + iterable=bed6df_dict.values(), chunksize=1000 ) pool.close() pool.join() - pool.restart() - splice_graph_dict = SpliceGraphDict(zip(bed6df_dict.keys(), splice_graphs)) return splice_graph_dict diff --git a/exfi/correct.py b/exfi/correct.py index 6855d78..572c835 100644 --- a/exfi/correct.py +++ b/exfi/correct.py @@ -19,7 +19,7 @@ import networkx as nx -from pathos.threading import ThreadPool +import pathos.multiprocessing as mp from Bio import \ SeqIO, \ @@ -412,19 +412,20 @@ def correct_splice_graph_dict(splice_graph_dict: SpliceGraphDict, args: dict) -> filled_edges_by_transcript[transcript] = set() # Initialize pool of workers - pool = ThreadPool(args["threads"]) + pool = mp.Pool(args["threads"]) # Process each graph in parallel logging.info("\tCorrecting each splice graph") - corrected_splice_graphs = pool.map( - _sculpt_graph, - splice_graph_dict.values(), - filled_edges_by_transcript.values(), + corrected_splice_graphs = pool.starmap( + func=_sculpt_graph, + iterable=zip( + splice_graph_dict.values(), + filled_edges_by_transcript.values() + ), chunksize=1000 ) pool.close() pool.join() - pool.restart() splice_graph_dict = SpliceGraphDict( zip(splice_graph_dict.keys(), corrected_splice_graphs) ) diff --git a/exfi/polish.py b/exfi/polish.py index c39f408..8159a2d 100644 --- a/exfi/polish.py +++ b/exfi/polish.py @@ -6,7 +6,8 @@ import networkx as nx -from pathos.threading import ThreadPool +import pathos.multiprocessing as mp + from exfi.io import \ _coordinate_to_str @@ -14,7 +15,6 @@ from exfi.classes import Coordinate, SpliceGraph, SpliceGraphDict, FastaDict - def trim_end(coordinate: Coordinate, bases: int) -> Coordinate: """Trim bases to the end of the coordinate @@ -152,17 +152,37 @@ def polish_splice_graph_dict( """ # Initialize pool of workers - pool = ThreadPool(args["threads"]) + pool = mp.Pool(args["threads"]) + + # def polish_wrapper(splice_graph: SpliceGraph) -> SpliceGraph: + # """Export fasta_dict to function. + # + # :param nx.DiGraph splice_graph: splice_graph to polish. + # """ + # return polish_splice_graph(splice_graph, fasta_dict) + + # # Run + # results = pool.starmap( + # func=polish_wrapper, + # iterable=splice_graph_dict.values(), + # chunksize=1000 + # ) + # pool.close() + # pool.join() + + # Run + sg_fasta_pairs = ( + (splice_graph, {transcript_id: fasta_dict[transcript_id]}) + for transcript_id, splice_graph in splice_graph_dict.items() + ) - results = pool.map( - polish_splice_graph, - splice_graph_dict.values(), - ({transcript_id: fasta_dict[transcript_id]} for transcript_id in splice_graph_dict.keys()), + results = pool.starmap( + func=polish_splice_graph, + iterable=sg_fasta_pairs, chunksize=1000 ) pool.close() pool.join() - pool.restart() # Add results to splice_graph_dict for i, transcript in enumerate(splice_graph_dict.keys()): From 53a140e828e46ec51993f3ff4e3e61d04cc855ac Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Mon, 5 Mar 2018 18:04:22 +0100 Subject: [PATCH 11/45] --debug: Improved logging and loggers --- bin/build_baited_bloom_filter | 16 +++++++++++++--- bin/build_splice_graph | 15 +++++++++++---- bin/gfa1_to_exons | 14 +++++++++++--- bin/gfa1_to_gapped_transcripts | 15 ++++++++++++--- exfi/io/splice_graph_dict_to_gfa1.py | 5 ++++- 5 files changed, 51 insertions(+), 14 deletions(-) diff --git a/bin/build_baited_bloom_filter b/bin/build_baited_bloom_filter index dadb5cd..354624c 100755 --- a/bin/build_baited_bloom_filter +++ b/bin/build_baited_bloom_filter @@ -105,6 +105,13 @@ parser.add_argument( help="Increase output verbosity" ) +parser.add_argument( + "-d", "--debug", + action="store_true", + dest="debug", + help="Log everything!" +) + if __name__ == '__main__': # Store arguments @@ -114,10 +121,13 @@ if __name__ == '__main__': args["threads"] = int(args["threads"]) # I don't know why it is parsed as tuple # Set up logger + logger = logging.getLogger() + logging.basicConfig(format='%(asctime)s\t%(module)s\t%(message)s', level=logging.ERROR) if args["verbose"]: - logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') - else: - logging.basicConfig(level=logging.CRITICAL, format='%(asctime)s %(message)s') + logger.setLevel(logging.INFO) + if args["debug"]: + logger.setLevel(logging.DEBUG) + # Check inputs assert isfile(args["fasta"]), 'ERROR: input fasta does not exist' diff --git a/bin/build_splice_graph b/bin/build_splice_graph index efe9da1..2c1be2e 100755 --- a/bin/build_splice_graph +++ b/bin/build_splice_graph @@ -165,6 +165,12 @@ parser.add_argument( help="Polish overlaps in which a AG-GT signal is detected" ) +parser.add_argument( + "-d", "--debug", + action="store_true", + dest="debug", + help="Log everything!" +) args = parser.parse_args() @@ -176,11 +182,12 @@ if __name__ == "__main__": args["fasta"] = abspath(args["fasta"]) args["gfa1"] = abspath(args["gfa1"]) - # Set up logger + logger = logging.getLogger() + logging.basicConfig(format='%(asctime)s\t%(module)s\t%(message)s', level=logging.ERROR) if args["verbose"]: - logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') - else: - logging.basicConfig(level=logging.CRITICAL, format='%(asctime)s %(message)s') + logger.setLevel(logging.INFO) + if args["debug"]: + logger.setLevel(logging.DEBUG) # Get predicted exons in bed format positive_exons_bed = _find_exons_pipeline(args) diff --git a/bin/gfa1_to_exons b/bin/gfa1_to_exons index 4b66f2a..20d288f 100755 --- a/bin/gfa1_to_exons +++ b/bin/gfa1_to_exons @@ -76,6 +76,12 @@ parser.add_argument( help="Increase output verbosity" ) +parser.add_argument( + "-d", "--debug", + action="store_true", + dest="debug", + help="Log everything!" +) args = parser.parse_args() @@ -87,10 +93,12 @@ if __name__ == "__main__": args["fasta"] = abspath(args["fasta"]) # Set up logger + logger = logging.getLogger() + logging.basicConfig(format='%(asctime)s\t%(module)s\t%(message)s', level=logging.ERROR) if args["verbose"]: - logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') - else: - logging.basicConfig(level=logging.CRITICAL, format='%(asctime)s %(message)s') + logger.setLevel(logging.INFO) + if args["debug"]: + logger.setLevel(logging.DEBUG) masking = "none" if args["soft_mask_overlaps"] == True: diff --git a/bin/gfa1_to_gapped_transcripts b/bin/gfa1_to_gapped_transcripts index aeba470..a36a5f0 100755 --- a/bin/gfa1_to_gapped_transcripts +++ b/bin/gfa1_to_gapped_transcripts @@ -91,6 +91,13 @@ parser.add_argument( help="Increase output verbosity" ) +parser.add_argument( + "-d", "--debug", + action="store_true", + dest="debug", + help="Log everything!" +) + if __name__ == "__main__": @@ -100,10 +107,12 @@ if __name__ == "__main__": args["fasta"] = abspath(args["fasta"]) # Set up logger + logger = logging.getLogger() + logging.basicConfig(format='%(asctime)s\t%(module)s\t%(message)s', level=logging.ERROR) if args["verbose"]: - logging.basicConfig(level=logging.INFO, format='%(asctime)s\t%(message)s') - else: - logging.basicConfig(level=logging.CRITICAL, format='%(asctime)s %(message)s') + logger.setLevel(logging.INFO) + if args["debug"]: + logger.setLevel(logging.DEBUG) masking = "none" if args["soft_mask_overlaps"] == True: diff --git a/exfi/io/splice_graph_dict_to_gfa1.py b/exfi/io/splice_graph_dict_to_gfa1.py index 9fe62ae..060ed58 100644 --- a/exfi/io/splice_graph_dict_to_gfa1.py +++ b/exfi/io/splice_graph_dict_to_gfa1.py @@ -117,7 +117,10 @@ def _compute_containments(splice_graph_dict: SpliceGraphDict) -> Generator[str, for node, coordinates in natsorted(node2coordinates.items()): for (transcript_id, start, end) in coordinates: - logging.debug("\t\tProcessing %s - %s:%s-%s", node, transcript_id, start, end) + logging.debug( + "\t\tProcessing node %s with coordinates %s:%s-%s", + node, transcript_id, start, end + ) cigar = str(int(end) - int(start)) + "M" yield f"C\t{transcript_id}\t+\t{node}\t+\t{start}\t{cigar}\n" From a0353adaa44f15d86ff5d3c852c5483fbec00e01 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Mon, 5 Mar 2018 18:06:33 +0100 Subject: [PATCH 12/45] version ++ 1.4.10 --- exfi/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exfi/__init__.py b/exfi/__init__.py index 1fa4c31..4f070c0 100644 --- a/exfi/__init__.py +++ b/exfi/__init__.py @@ -3,4 +3,4 @@ filters. """ -__version__ = '1.4.9' +__version__ = '1.4.10' From fce84fdf9f880b763f8940fc8fb3dc29def7dab6 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Mon, 5 Mar 2018 18:29:10 +0100 Subject: [PATCH 13/45] Bugfix: full transition back to pathos.mp --- exfi/__init__.py | 2 +- exfi/build_splice_graph_dict.py | 5 ++--- exfi/correct.py | 24 ++++++------------------ exfi/polish.py | 13 ++++++++----- scripts/pipeline_chr25.sh | 9 ++++++--- 5 files changed, 23 insertions(+), 30 deletions(-) diff --git a/exfi/__init__.py b/exfi/__init__.py index 6ec3ebc..4f070c0 100644 --- a/exfi/__init__.py +++ b/exfi/__init__.py @@ -3,4 +3,4 @@ filters. """ -__version__ = '1.4.10' \ No newline at end of file +__version__ = '1.4.10' diff --git a/exfi/build_splice_graph_dict.py b/exfi/build_splice_graph_dict.py index acda6f8..a020acc 100644 --- a/exfi/build_splice_graph_dict.py +++ b/exfi/build_splice_graph_dict.py @@ -9,7 +9,7 @@ import networkx as nx import pandas as pd -from pathos.threading import ThreadPool +import pathos.multiprocessing as mp from exfi.classes import Node2Coordinates, Edge2Overlap, Coordinate, SpliceGraph, SpliceGraphDict, \ Path2Nodes @@ -191,7 +191,7 @@ def build_splice_graph_dict( bed6df_dict = bed3_records_to_bed6df_dict(bed3records) # Initialize pool of workers - pool = ThreadPool(args["threads"]) + pool = mp.Pool(args["threads"]) # Build graphs in parallel and merge results splice_graphs = pool.map( @@ -201,7 +201,6 @@ def build_splice_graph_dict( ) pool.close() pool.join() - pool.restart() splice_graph_dict = SpliceGraphDict(zip(bed6df_dict.keys(), splice_graphs)) diff --git a/exfi/correct.py b/exfi/correct.py index 6855d78..5b92696 100644 --- a/exfi/correct.py +++ b/exfi/correct.py @@ -19,7 +19,7 @@ import networkx as nx -from pathos.threading import ThreadPool +import pathos.multiprocessing as mp from Bio import \ SeqIO, \ @@ -222,18 +222,6 @@ def _rename_nodes_from_collapse(quotient_graph: SpliceGraph) -> Dict[str, str]: mapping[key] = value[0] return mapping - # # Original dict - # mapping_raw: Dict[str, Tuple[str, ...]] = { # Old -> New - # node_id: tuple(natsorted(node for node in node_id)) - # for node_id in quotient_graph.nodes() - # } - - # # New dict - # mapping: Dict[str, str] = {} - # for key, value in mapping_raw.items(): - # if len(value) == 1: - # mapping[key] = value[0] - # return mapping def _recompute_node2coord( @@ -412,19 +400,19 @@ def correct_splice_graph_dict(splice_graph_dict: SpliceGraphDict, args: dict) -> filled_edges_by_transcript[transcript] = set() # Initialize pool of workers - pool = ThreadPool(args["threads"]) + pool = mp.Pool(args["threads"]) # Process each graph in parallel logging.info("\tCorrecting each splice graph") - corrected_splice_graphs = pool.map( + corrected_splice_graphs = pool.starmap( _sculpt_graph, - splice_graph_dict.values(), - filled_edges_by_transcript.values(), + zip(splice_graph_dict.values(), filled_edges_by_transcript.values()), chunksize=1000 ) pool.close() pool.join() - pool.restart() + + splice_graph_dict = SpliceGraphDict( zip(splice_graph_dict.keys(), corrected_splice_graphs) ) diff --git a/exfi/polish.py b/exfi/polish.py index 26d06a1..4c65dbe 100644 --- a/exfi/polish.py +++ b/exfi/polish.py @@ -153,17 +153,20 @@ def polish_splice_graph_dict( """ # Initialize pool of workers - pool = ThreadPool(args["threads"]) + pool = mp.Pool(args["threads"]) - results = pool.map( + splice_graphs = (splice_graph for splice_graph in splice_graph_dict.values()) + fasta_dicts = ( + {transcript_id: fasta_dict[transcript_id]} for transcript_id in splice_graph_dict.keys() + ) + + results = pool.starmap( polish_splice_graph, - splice_graph_dict.values(), - ({transcript_id: fasta_dict[transcript_id]} for transcript_id in splice_graph_dict.keys()), + zip(splice_graphs, fasta_dicts), chunksize=1000 ) pool.close() pool.join() - pool.restart() # Add results to splice_graph_dict for i, transcript in enumerate(splice_graph_dict.keys()): diff --git a/scripts/pipeline_chr25.sh b/scripts/pipeline_chr25.sh index 8db1095..976983d 100644 --- a/scripts/pipeline_chr25.sh +++ b/scripts/pipeline_chr25.sh @@ -82,16 +82,19 @@ build_splice_graph \ --correct \ --output-gfa results/drer25real.gfa \ --verbose \ - --threads 4 + --threads 4 \ + --debug gfa1_to_exons \ --input-gfa results/drer25sim.gfa \ --output-fasta results/drer25sim_exons.fa \ - --soft-mask-overlaps + --soft-mask-overlaps \ + --debug gfa1_to_exons \ --input-gfa results/drer25real.gfa \ --output-fasta results/drer25real_exons.fa \ - --soft-mask-overlaps + --soft-mask-overlaps \ + --debug bash src/pr_chr25.sh From 0f2850b31ba2cda11787b0e156f16c1d3f882e28 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Sat, 10 Mar 2018 16:04:16 +0100 Subject: [PATCH 14/45] Removed asserts from input files: now we can use /dev/fd, stdin, stdout, devnull... --- bin/build_baited_bloom_filter | 3 --- 1 file changed, 3 deletions(-) diff --git a/bin/build_baited_bloom_filter b/bin/build_baited_bloom_filter index 354624c..86959b3 100755 --- a/bin/build_baited_bloom_filter +++ b/bin/build_baited_bloom_filter @@ -130,13 +130,10 @@ if __name__ == '__main__': # Check inputs - assert isfile(args["fasta"]), 'ERROR: input fasta does not exist' assert args["kmer"] >= 1, 'ERROR: incorrect kmer size' # assert bloom_size assert args["levels"] >= 1, 'ERROR: incorrect number of levels' assert args["threads"] >= 1, 'ERROR: incorrect number of threads' - for read_file in args["reads"]: - assert isfile(read_file), f'ERROR: file {read_file} does not exist' # Check if programs are in path from shutil import which From c8cc83503926f0252706f2162e28dd8bd80bd342 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Sat, 10 Mar 2018 16:07:49 +0100 Subject: [PATCH 15/45] v 1.4.11 --- exfi/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exfi/__init__.py b/exfi/__init__.py index 4f070c0..7579919 100644 --- a/exfi/__init__.py +++ b/exfi/__init__.py @@ -3,4 +3,4 @@ filters. """ -__version__ = '1.4.10' +__version__ = '1.4.11' From 5176a7347fe3ab724bc659a82d0e099afb9ab2e4 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Mon, 28 May 2018 18:05:12 +0200 Subject: [PATCH 16/45] Fixed memory leak when multiprocessing - Kill each child when they end their task: maxtasksperchild=1 --- exfi/build_splice_graph_dict.py | 4 ++-- exfi/correct.py | 4 ++-- exfi/polish.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/exfi/build_splice_graph_dict.py b/exfi/build_splice_graph_dict.py index a020acc..ebdf520 100644 --- a/exfi/build_splice_graph_dict.py +++ b/exfi/build_splice_graph_dict.py @@ -191,13 +191,13 @@ def build_splice_graph_dict( bed6df_dict = bed3_records_to_bed6df_dict(bed3records) # Initialize pool of workers - pool = mp.Pool(args["threads"]) + pool = mp.Pool(args["threads"], maxtasksperchild=1) # Build graphs in parallel and merge results splice_graphs = pool.map( build_splice_graph, bed6df_dict.values(), - chunksize=1000 + chunksize=1000 # Number of Transcripts to process at one ) pool.close() pool.join() diff --git a/exfi/correct.py b/exfi/correct.py index 5b92696..0506911 100644 --- a/exfi/correct.py +++ b/exfi/correct.py @@ -400,14 +400,14 @@ def correct_splice_graph_dict(splice_graph_dict: SpliceGraphDict, args: dict) -> filled_edges_by_transcript[transcript] = set() # Initialize pool of workers - pool = mp.Pool(args["threads"]) + pool = mp.Pool(args["threads"], maxtasksperchild=1) # Process each graph in parallel logging.info("\tCorrecting each splice graph") corrected_splice_graphs = pool.starmap( _sculpt_graph, zip(splice_graph_dict.values(), filled_edges_by_transcript.values()), - chunksize=1000 + chunksize=1000 # Number of splice graphs to process at once. ) pool.close() pool.join() diff --git a/exfi/polish.py b/exfi/polish.py index 4c65dbe..7aef2ac 100644 --- a/exfi/polish.py +++ b/exfi/polish.py @@ -153,7 +153,7 @@ def polish_splice_graph_dict( """ # Initialize pool of workers - pool = mp.Pool(args["threads"]) + pool = mp.Pool(args["threads"], maxtasksperchild=1) splice_graphs = (splice_graph for splice_graph in splice_graph_dict.values()) fasta_dicts = ( @@ -163,7 +163,7 @@ def polish_splice_graph_dict( results = pool.starmap( polish_splice_graph, zip(splice_graphs, fasta_dicts), - chunksize=1000 + chunksize=1000 # Number of splice_graphs to process at once ) pool.close() pool.join() From 07e147b73f9af7d5cd45d757bf962a6b30f45135 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Mon, 28 May 2018 18:10:14 +0200 Subject: [PATCH 17/45] Version 1.4.12 --- README.md | 2 +- exfi/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c15cdc8..859684e 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Get exons from a transcriptome and raw genomic reads using abyss-bloom and bedto ## Requirements ``` -abyss>=2.0.0 +abyss==2.0.1 (something is happening with 2.0.2 and abyss-bloom kmers) bedtools (tested on 2.0) python3 biopython diff --git a/exfi/__init__.py b/exfi/__init__.py index 7579919..44bea25 100644 --- a/exfi/__init__.py +++ b/exfi/__init__.py @@ -3,4 +3,4 @@ filters. """ -__version__ = '1.4.11' +__version__ = '1.4.12' From 92fe53155e47bd2d482f7bde13a826c71f5edab7 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Mon, 14 Jan 2019 14:36:47 +0100 Subject: [PATCH 18/45] pd.DataFrame as central structure. Missing tests --- bin/build_splice_graph | 74 ++--- bin/gfa1_to_exons | 21 +- bin/gfa1_to_gapped_transcripts | 18 +- exfi/__init__.py | 2 +- exfi/build_splice_graph_dict.py | 207 ------------ exfi/classes.py | 212 ------------- exfi/collapse.py | 154 --------- exfi/correct.py | 436 +++++--------------------- exfi/find_exons.py | 38 +-- exfi/io/__init__.py | 34 +- exfi/io/bed.py | 78 +++++ exfi/io/bed4_to_gfa1.py | 89 ++++++ exfi/io/components.py | 120 ------- exfi/io/fasta_to_dict.py | 10 +- exfi/io/gfa1_to_bed.py | 33 ++ exfi/io/gfa1_to_exons.py | 57 ---- exfi/io/gfa1_to_fasta.py | 69 ++++ exfi/io/gfa1_to_gapped_transcripts.py | 63 ---- exfi/io/gfa1_to_splice_graph_dict.py | 112 ------- exfi/io/masking.py | 14 +- exfi/io/read_bed3.py | 18 ++ exfi/io/read_gfa1.py | 166 ---------- exfi/io/splice_graph_dict_to_gfa1.py | 172 ---------- exfi/polish.py | 222 +++++-------- scripts/pre-commit | 37 +++ setup.py | 2 - tests/auxiliary_functions.py | 29 +- tests/custom_assertions.py | 12 +- tests/io/complex.bed | 15 + tests/io/empty.bed | 0 tests/io/simple.bed | 1 + tests/test_collapse.py | 217 ------------- tests/test_find_exons.py | 62 +++- tests/test_io/test_fasta_to_dict.py | 11 +- 34 files changed, 672 insertions(+), 2133 deletions(-) delete mode 100644 exfi/build_splice_graph_dict.py delete mode 100644 exfi/classes.py delete mode 100644 exfi/collapse.py mode change 100644 => 100755 exfi/correct.py create mode 100644 exfi/io/bed.py create mode 100644 exfi/io/bed4_to_gfa1.py delete mode 100644 exfi/io/components.py create mode 100644 exfi/io/gfa1_to_bed.py delete mode 100644 exfi/io/gfa1_to_exons.py create mode 100644 exfi/io/gfa1_to_fasta.py delete mode 100644 exfi/io/gfa1_to_gapped_transcripts.py delete mode 100644 exfi/io/gfa1_to_splice_graph_dict.py create mode 100644 exfi/io/read_bed3.py delete mode 100644 exfi/io/read_gfa1.py delete mode 100644 exfi/io/splice_graph_dict_to_gfa1.py mode change 100644 => 100755 exfi/polish.py create mode 100755 scripts/pre-commit create mode 100644 tests/io/complex.bed create mode 100644 tests/io/empty.bed create mode 100644 tests/io/simple.bed delete mode 100644 tests/test_collapse.py diff --git a/bin/build_splice_graph b/bin/build_splice_graph index 2c1be2e..abd8398 100755 --- a/bin/build_splice_graph +++ b/bin/build_splice_graph @@ -12,32 +12,16 @@ from Bio import SeqIO from exfi import __version__ -from exfi.find_exons import \ - _get_fasta, \ - _find_exons_pipeline - -from exfi.build_splice_graph_dict import \ - build_splice_graph_dict - -from exfi.correct import \ - correct_splice_graph_dict - -from exfi.polish import \ - polish_splice_graph_dict - -from exfi.io.fasta_to_dict import \ - fasta_to_dict - -from exfi.io.splice_graph_dict_to_gfa1 import \ - splice_graph_dict_to_gfa1 - -from exfi.collapse import \ - collapse_splice_graph_dict - - +from exfi.find_exons import _find_exons_pipeline +from exfi.io.bed import bed3_to_bed4 +from exfi.io.fasta_to_dict import fasta_to_dict +from exfi.polish import polish_bed4 +from exfi.correct import correct_bed4 +from exfi.io.bed4_to_gfa1 import bed4_to_gfa1 parser = argparse.ArgumentParser( - usage='build_splicegraph -i transcriptome.fa -b bloom_filter.bf -k 30 -o exome.gfa', + usage='build_splicegraph -i transcriptome.fa -b bloom_filter.bf -k 30 ' + '-o exome.gfa', description='Store the predicted exome in GFA format', epilog='Jorge Langa. Send issues and pull requests to github.com/jlanga/' 'exfi', @@ -137,7 +121,8 @@ parser.add_argument( parser.add_argument( '--correct', '-C', - help='Correct splice graph by using sealer between exons that seem nearby [False]', + help='Correct splice graph by using sealer between exons that seem nearby ' + '[False]', dest='correct', action="store_true" ) @@ -190,33 +175,36 @@ if __name__ == "__main__": logger.setLevel(logging.DEBUG) # Get predicted exons in bed format - positive_exons_bed = _find_exons_pipeline(args) - - # Build splice graph - splice_graph_dict = build_splice_graph_dict(positive_exons_bed, args) + bed3 = _find_exons_pipeline(args) + bed4 = bed3_to_bed4(bed3) # Transcriptome_dict transcriptome_dict = fasta_to_dict(args["fasta"]) if args["polish"]: - splice_graph_dict = polish_splice_graph_dict(splice_graph_dict, transcriptome_dict, args) + bed4 = polish_bed4(bed4=bed4, transcriptome_dict=transcriptome_dict) if args["correct"]: - splice_graph_dict = correct_splice_graph_dict(splice_graph_dict, args) - - if args["collapse"]: - splice_graph_dict = { - "collapsed": collapse_splice_graph_dict( - splice_graph_dict=splice_graph_dict, - transcriptome_dict=transcriptome_dict - ) - } + bed4 = correct_bed4( + bed4=bed4, transcriptome_dict=transcriptome_dict, args=args + ) + + # if args["collapse"]: + # splice_graph_dict = { + # "collapsed": collapse_splice_graph_dict( + # splice_graph_dict=splice_graph_dict, + # transcriptome_dict=transcriptome_dict + # ) + # } # Write to GFA1 - splice_graph_dict_to_gfa1( - splice_graph_dict=splice_graph_dict, - transcriptome_dict=transcriptome_dict, - filename=args["gfa1"] + bed4_to_gfa1( + gfa1_fn=args["gfa1"], bed4=bed4, transcriptome_dict=transcriptome_dict ) + # splice_graph_dict_to_gfa1( + # splice_graph_dict=splice_graph_dict, + # transcriptome_dict=transcriptome_dict, + # filename=args["gfa1"] + # ) logging.info("Done!") diff --git a/bin/gfa1_to_exons b/bin/gfa1_to_exons index 20d288f..a43b3c0 100755 --- a/bin/gfa1_to_exons +++ b/bin/gfa1_to_exons @@ -10,7 +10,7 @@ from os.path import \ from exfi import __version__ -from exfi.io.gfa1_to_exons import \ +from exfi.io.gfa1_to_fasta import \ gfa1_to_exons parser = argparse.ArgumentParser( @@ -34,7 +34,8 @@ parser.add_argument( '-i', type=str, required=True, - help='Input splice graph in GFA1 format (the results from build_splicegraph)', + help='Input splice graph in GFA1 format (the results from ' + 'build_splicegraph)', dest='gfa1', metavar='FILE' ) @@ -100,16 +101,16 @@ if __name__ == "__main__": if args["debug"]: logger.setLevel(logging.DEBUG) - masking = "none" - if args["soft_mask_overlaps"] == True: - masking = "soft" - if args["hard_mask_overlaps"] == True: - masking = "hard" + # masking = "none" + # if args["soft_mask_overlaps"] == True: + # masking = "soft" + # if args["hard_mask_overlaps"] == True: + # masking = "hard" gfa1_to_exons( - gfa_in_fn=args["gfa1"], - fasta_out_fn=args["fasta"], - masking=masking + fasta_out=args["fasta"], + gfa1_in=args["gfa1"], + # masking=masking ) logging.info("Done!") diff --git a/bin/gfa1_to_gapped_transcripts b/bin/gfa1_to_gapped_transcripts index a36a5f0..6298403 100755 --- a/bin/gfa1_to_gapped_transcripts +++ b/bin/gfa1_to_gapped_transcripts @@ -9,7 +9,7 @@ from os.path import \ abspath from exfi import __version__ -from exfi.io.gfa1_to_gapped_transcripts import \ +from exfi.io.gfa1_to_fasta import \ gfa1_to_gapped_transcripts @@ -114,17 +114,17 @@ if __name__ == "__main__": if args["debug"]: logger.setLevel(logging.DEBUG) - masking = "none" - if args["soft_mask_overlaps"] == True: - masking = "soft" - if args["hard_mask_overlaps"] == True: - masking = "hard" + # masking = "none" + # if args["soft_mask_overlaps"] == True: + # masking = "soft" + # if args["hard_mask_overlaps"] == True: + # masking = "hard" gfa1_to_gapped_transcripts( - gfa_in=args["gfa1"], fasta_out=args["fasta"], - number_of_ns=args["number_of_ns"], - masking=masking + gfa1_in=args["gfa1"], + gap_size=args["number_of_ns"], + # masking=masking ) logging.info("Done!") diff --git a/exfi/__init__.py b/exfi/__init__.py index 44bea25..9975e27 100644 --- a/exfi/__init__.py +++ b/exfi/__init__.py @@ -3,4 +3,4 @@ filters. """ -__version__ = '1.4.12' +__version__ = '1.5.0' diff --git a/exfi/build_splice_graph_dict.py b/exfi/build_splice_graph_dict.py deleted file mode 100644 index ebdf520..0000000 --- a/exfi/build_splice_graph_dict.py +++ /dev/null @@ -1,207 +0,0 @@ -#!/usr/bin/env python3 - -"""Module to build the associated splice graph from a set of bed3 records""" - -from typing import \ - Iterable - -import logging - -import networkx as nx -import pandas as pd -import pathos.multiprocessing as mp - -from exfi.classes import Node2Coordinates, Edge2Overlap, Coordinate, SpliceGraph, SpliceGraphDict, \ - Path2Nodes - -def _bed3_to_str(bed3_record: Coordinate) -> str: - """Convert a three element tuple into a string of the form a[0]:a[1]-a[2] - - :param tuple bed3_record: tuple. Must have length 3 - """ - if len(bed3_record) == 3: - return "{0}:{1}-{2}".format(*bed3_record) - else: - raise IndexError("Incorrect number of elements in record") - - -def bed3_records_to_bed6df_dict(iterable_of_bed3: Iterable[Coordinate]) -> pd.DataFrame: - """Convert an iterable of bed3 records to a bed6 as dataframe - - Columns of the bed6df are seqid, start, end, name, strand and score - - :param iterable_of_bed3: Iterable of tuples in BED3 format (seqid, start, end). - """ - logging.info("\tbed3_records_to_bed6df_dict") - bed6_cols = ['chrom', 'start', 'end', 'name', 'score', 'strand'] - bed6_df = pd.DataFrame( - data=(bed3_record + (_bed3_to_str((bed3_record)), 0, '+') - for bed3_record in iterable_of_bed3), - columns=bed6_cols - )\ - .sort_values(by=bed6_cols[0:2])\ - - bed6df_dict = { - transcript_id: dataframe - for transcript_id, dataframe in bed6_df.groupby("chrom") - } - - return bed6df_dict - - -def bed6df_to_node2coordinates(bed6df: pd.DataFrame) -> Node2Coordinates: - """Get from the BED6 dataframe the dict of node_id : coordinates - - :param pd.DataFrame bed6df: DataFrame of BED6 records. - """ - logging.debug("\tbed6bed6df_to_node2coordinates") - # Check for extreme case: - if bed6df.shape[0] == 0: - return Node2Coordinates() - # Compute the node_id: coordinates dict - node2coordinate = bed6df\ - .sort_values(['chrom', 'start', 'end'])\ - .drop(["score", "strand"], axis=1)\ - .assign( - coordinates=bed6df - [["chrom", "start", "end"]] - .apply(tuple, axis=1) - )\ - .drop(["chrom", "start", "end"], axis=1)\ - .set_index("name", "coordinates")\ - .to_dict()["coordinates"] - - # Reprocess the dict, one node may be in multiple transcripts at once - node2coordinate = { - key: (value,) - for key, value in node2coordinate.items() - } - - return node2coordinate - - -def bed6df_to_path2node(bed6df: pd.DataFrame) -> Path2Nodes: - """Get a dict containing transcript_id to the tuple of node names that compose it in order, - indicating the path. - - :param pd.DataFrame bed6df: DataFrame with BED6 records. - """ - logging.debug("\tbed6df -> path2node") - if bed6df.shape[0] > 0: - return bed6df\ - .sort_values(['chrom', 'start', 'end'])\ - .drop(['start', 'end', 'strand', 'score'], axis=1)\ - .rename(columns={'chrom': 'path'})\ - .groupby('path')\ - .agg(lambda x: tuple(x.tolist()))\ - .to_dict()["name"] - return Path2Nodes() - - -def compute_edge_overlaps(splice_graph: SpliceGraph) -> Edge2Overlap: - """Get the dict of overlaps between exons. - - Such dict has as keys a tuple of two connected nodes and as value the overlap between them: - - - Positive overlap means that they overlap that number of bases, - - Zero that they occur next to each other - - Negative that there is a gap in the transcriptome of that number of bases - (one or multiple exons of length < kmer) - - :param splice_graph: returns: Note: the splice graph must have already the nodes written with - coordinates, and the edges alredy entered too. - - Hypothesis: node2coords.values should only hold one value - - """ - logging.debug("\tComputing edge overlaps") - - # Init - node2coords = nx.get_node_attributes( - G=splice_graph, - name='coordinates' - ) - - edge_overlaps = Edge2Overlap({edge: None for edge in splice_graph.edges()}) - - for (node1, node2) in edge_overlaps.keys(): - node1_end = node2coords[node1][0][2] - node2_start = node2coords[node2][0][1] - - # Overlap in bases, 0 means one next to the other, negative numbers a gap - overlap = node1_end - node2_start - edge_overlaps[(node1, node2)] = overlap - - return edge_overlaps - - -def build_splice_graph(bed6df: pd.DataFrame) -> SpliceGraph: - """Build the splice_graph from a dataframe of bed6 records - - splice_graph is a directed graph, whose nodes - - are an identifier, the tuple in string format - - whose attributes are - - the cooridnates in (str, int, int) format - and whose edges - - are connected exons in any way - - attributes are the overlap between them: - - positive means there is an overlap of that number of bases - - zero means no overlap - - negative means a gap of that number of bases - - :param pd.DataFrame bed6df: Exon coordinates in BED6 format - """ - logging.debug("Running build_splice_graph") - # Initialize graph - splice_graph = SpliceGraph() - - # Process nodes - logging.debug("\tAdding nodes") - splice_graph.add_nodes_from(bed6df["name"].tolist()) - nx.set_node_attributes( # Add coordinates - G=splice_graph, - name="coordinates", - values=bed6df_to_node2coordinates(bed6df) - ) - - # Process edges - logging.debug("\tAdding edges") - transcript2path = bed6df_to_path2node(bed6df) - for path in transcript2path.values(): - splice_graph.add_path(path) - nx.set_edge_attributes( - G=splice_graph, - name='overlaps', - values=compute_edge_overlaps(splice_graph) - ) - - return splice_graph - - -def build_splice_graph_dict( - bed3records: Iterable[Coordinate], args: dict) -> SpliceGraphDict: - """Build the SpliceGraphDict from a bunch of BED3 records - - :param iterable bed3records: Iterable of tuples containing bed3 records - :param dict args: args to be passed to the pipeline - """ - logging.info("Building splice graph") - - # Process bed records - bed6df_dict = bed3_records_to_bed6df_dict(bed3records) - - # Initialize pool of workers - pool = mp.Pool(args["threads"], maxtasksperchild=1) - - # Build graphs in parallel and merge results - splice_graphs = pool.map( - build_splice_graph, - bed6df_dict.values(), - chunksize=1000 # Number of Transcripts to process at one - ) - pool.close() - pool.join() - - splice_graph_dict = SpliceGraphDict(zip(bed6df_dict.keys(), splice_graphs)) - - return splice_graph_dict diff --git a/exfi/classes.py b/exfi/classes.py deleted file mode 100644 index 78ed548..0000000 --- a/exfi/classes.py +++ /dev/null @@ -1,212 +0,0 @@ -#!/usr/bin/env python3.6 - -"""Classes for exfi: - -- FastaDict -- Node2Coordinates -- Edge2Overlap -- SpliceGraph -- SpliceGraphDict -""" - - - -from typing import \ - Tuple, Dict - -from collections import namedtuple - -import networkx as nx - -# from exfi.correct import correct_splice_graph_dict -# from exfi.collapse import collapse_splice_graph_dict -# from exfi.polish import polish_splice_graph_dict -# from exfi.io.splice_graph_dict_to_gfa1 import splice_graph_dict_to_gfa1 -# from exfi.io.gfa1_to_splice_graph_dict import gfa1_to_splice_graph_dict -# from exfi.build_splice_graph_dict import build_splice_graph_dict - - - -Coordinate = namedtuple('Coordinate', ['seqid', 'start', 'end']) - -# class Coordinate(Tuple[str, int, int]): -# """Store coordinates as tuples of the shape str, int, int""" -# def __init__(cls, seqid, start, end): -# self = (seqid, start, end) -# -# def __new__(cls, seqid, start, end): -# """I don't know what the fuck i am doing: -# http://jfine-python-classes.readthedocs.io/en/latest/subclass-tuple.html -# """ -# return tuple.__new__(cls, (seqid, start, end)) - # seqid = None - # start = None - # end = None - # - # def __init__(self, seqid: str, start: int, end: int): - # super(Coordinate, self).__init__() - # if isinstance(seqid, str) and isinstance(start, int) and isinstance(end, int): - # self.seqid = seqid - # self.start = start - # self.end = end - # else: - # type_seqid = type(seqid) - # type_start = type(start) - # type_end = type(end) - # raise TypeError( - # f"seqid must be str: {type_seqid}, " - # f"start must be int: {type_start}, and end must be" - # f" int: {type_end}." - # ) - # - # def __str__(self) -> None: - # """Print to : - notation - # - # ("chr1", 100, 150) -> "chr1:100-150" - # """ - # print(f"{self.seqid}:{self.start}-{self.end}") - # - # def exted_bases_at_start(self, bases): - # """Extend bases at start - # - # ("chr1", 100, 150) + 5 -> ("chr1", 95, 150) - # """ - # self.start -= bases - # - # def extend_bases_at_end(self, bases): - # """Extend bases at end" - # - # ("chr1", 100, 150) + 5 -> ("chr", 100, 155) - # """ - # self.end += bases - - - -class FastaDict(Dict[str, str]): - """Class for fasta dictionaries. - - FastaDict is basically a dictionary where keys are sequence indentifiers and values nucleotide - sequences as str. - """ - # pylint: disable=too-few-public-methods - pass - # @staticmethod - # def build_from_fasta(filename: str) -> None: - # """Read Fasta file""" - # fasta_dict = FastaDict() - # with open(filename, "r") as handle: - # for key, value in SimpleFastaParser(handle): - # fasta_dict[key] = value - # return fasta_dict - - - - - - - -class Node2Coordinates(Dict[str, Tuple[Coordinate, ...]]): - """Node2Coordinates is a dict where keys are node ids and values is a tuple of tuples containing - the exon coordinates with respect to the transcriptome in form (transcript, start, end).""" - # pylint: disable=too-few-public-methods - pass - - - -class Edge2Overlap(Dict[Tuple[str, str], int]): - """Edge2Overlaps is a dict where keys are tuples of two node identifiers and the value is an int - """ - # pylint: disable=too-few-public-methods - pass - - - -class SpliceGraph(nx.DiGraph): - """Class to work with single splice graphs. - - A SpliceGraph is basically a nx.DiGraph whose: - - nodes are exon identifiers, - - links are nodes that are contiguous, - - nodes have an attribute called "coordinates", which is a Node2Coordinates object. - - edges have an attribute called "overlap", which is a Edge2Overlap object. - """ - # pylint: disable=too-few-public-methods - pass - # def __init__(self) -> None: - # super(SpliceGraph).__init__() - # self = nx.DiGraph() - # - # def set_node2coordinates(self, node2coordinates: Node2Coordinates = None) -> None: - # """Use nx.set_node_attributes""" - # if isinstance(node2coordinates, Node2Coordinates): - # nx.set_node_attributes(G=self.splice_graph, name="coordinates", - # values=node2coordinates) - # else: - # raise TypeError("node2coordinates is not Node2Coordinates: ", type(node2coordinates)) - # - # def get_node2coordinates(self) -> Node2Coordinates: - # """Use nx.get_node_attributes and convert to Node2Coordinate.""" - # return Node2Coordinates(nx.get_node_attributes(G=self.splice_graph, name="cooridnates")) - # - # def add_edges(self, edges: Iterable[Tuple[str, str]]) -> None: - # """Add edges via nx.add_edges_from""" - # self.splice_graph.add_edges_from(edges) - # - # def set_edge2overlap(self, edge2overlap: Edge2Overlap = None) -> None: - # """Set edge to overlap values""" - # if isinstance(edge2overlap, Edge2Overlap): - # nx.set_edge_attributes(G=self.splice_graph, name="overlap", values=edge2overlap) - # else: - # raise TypeError("edge2overlap is not Edge2Overlap: ", type(edge2overlap)) - # - # def get_edge2overlap(self) -> Edge2Overlap: - # """Get edge2overlap data from SpliceGraph.""" - # return Edge2Overlap(nx.get_edge_attributes(G=self.splice_graph, name="overlaps")) - - - -class SpliceGraphDict(Dict[str, SpliceGraph]): - """Class to work with splice graphs in dict format "name": SpliceGraph""" - # pylint: disable=too-few-public-methods - pass - # def add_from_iterables( - # self, names: Iterable[str], splice_graphs: Iterable[SpliceGraph]) -> None: - # """Adds to SpliceGraphDict the splice_graphs_i with name names_i""" - # for name, splice_graph in zip(names, splice_graphs): - # self[name] = splice_graph - # - # def polish(self, fasta_dict: FastaDict, args: dict) -> None: - # """Polish overlaps according to the signal AGGT is found""" - # self = polish_splice_graph_dict(splice_graph_dict=self, fasta_dict=fasta_dict, args=args) - # - # def correct(self, args: dict) -> None: - # """Use abyss-sealer to correct/merge exons""" - # self = correct_splice_graph_dict(splice_graph_dict=self, args=args) - # - # def collapse(self, transcriptome_dict: FastaDict) -> None: - # """Create a new splice graph by merging all exons by sequence identity""" - # self = collapse_splice_graph_dict( - # splice_graph_dict=self, transcriptome_dict=transcriptome_dict) - # - # def write_to_gfa1(self, transcriptome_dict: FastaDict, filename: str) -> None: - # """Write SpliceGraphDict to GFA1 file""" - # splice_graph_dict_to_gfa1( - # splice_graph_dict=self, transcriptome_dict=transcriptome_dict, filename=filename) - # - # @staticmethod - # def load_from_bed3_records( - # bed3_records: Iterable[Coordinate], args: dict) -> None: - # """Build the SpliceGraphDict from BED3 records""" - # return SpliceGraphDict(build_splice_graph_dict(bed3records=bed3_records, args=args)) - # - # @staticmethod - # def load_from_gfa1_file(filename) -> None: - # """Build SpliceGraphDict from a GFA1 file""" - # return SpliceGraphDict(gfa1_to_splice_graph_dict(handle=filename)) - - - -class Path2Nodes(Dict[str, Tuple[str, ...]]): - """Class to store the dict transcript_id: [node1, ..., nodeN]""" - # pylint: disable=too-few-public-methods - pass diff --git a/exfi/collapse.py b/exfi/collapse.py deleted file mode 100644 index e810b23..0000000 --- a/exfi/collapse.py +++ /dev/null @@ -1,154 +0,0 @@ -#!/usr/bin/env python3 - -""" -exfi.collapse_splice_graph: merge nodes with the exact same sequence -""" - -import logging - -from typing import \ - Dict, Tuple - -import networkx as nx - -from exfi.classes import FastaDict, Node2Coordinates, Edge2Overlap, SpliceGraph, SpliceGraphDict - -def _compute_seq2node( - node2coord: Node2Coordinates, transcriptome_dict: FastaDict) -> Dict[str, Tuple[str, ...]]: - """Compute seq2node dict - - From - - a dict node2coord = {node_id: tuples of coordinates}, and - - a dict transcriptome_dict = {transcript_id: str}, - build the dict - - seq2node = {sequence(str): node_id (str)} - - :param node2coord: node to coordinates dict. - :param transcriptome_dict: transcript to sequence dict. - """ - # Get the node -> sequence - logging.debug("\t_compute_seq2node") - seq2node: Dict[str, Tuple[str, ...]] = {} - for node_id, coordinates in node2coord.items(): - seqid, start, end = coordinates[0] # Just take the first - sequence = transcriptome_dict[seqid][start:end] - if sequence not in seq2node: - seq2node[sequence] = () - seq2node[sequence] += (node_id, ) - return seq2node - - -def _compute_old2new(seq2node: Dict[str, Tuple[str, ...]]) -> Dict[str, str]: - """Compute the dict of old identifiers to new - - :param seq2node: Sequence to node dict - """ - logging.debug("\t_compute_old2new") - old2new = {} - for i, old_nodes in enumerate(seq2node.values()): - new_node = "exon_{exon_number:08d}".format(exon_number=i) - for old_node in old_nodes: - old2new[old_node] = new_node - return old2new - - -def _compute_new_node2coord(old2new: dict, node2coord: Node2Coordinates) -> Node2Coordinates: - """Recompute the node to coordinate dict - - :param old2new: dict of old to new names - :param node2coord: node to coordinates dict - - """ - logging.debug("\t_compute_new_node2coord") - # Compute the new set coordinates of each node - new_node2coord = Node2Coordinates() - for old_id, new_id in old2new.items(): - if new_id not in new_node2coord: - new_node2coord[new_id] = () - new_node2coord[new_id] += node2coord[old_id] - - return new_node2coord - - -def _compute_new_link2overlap(old2new: Dict[str, str], link2overlap: Edge2Overlap) -> Edge2Overlap: - """Recompute the link2overlaps dict accordint to the new node_ids - - :param old2new: dict of old to new names - :param link2overlap: old edge2overlap dict - - """ - logging.debug("\t_compute_new_link2overlap") - # Compute the new set of edges and overlaps - new_link2overlap = Edge2Overlap() - for (node_from, node_to), overlap in link2overlap.items(): - new_from = old2new[node_from] - new_to = old2new[node_to] - new_link2overlap[(new_from, new_to)] = overlap - return new_link2overlap - - -def _merge_node2coords(splice_graph_dict: SpliceGraphDict) -> Node2Coordinates: - """Take all node2coord from every graph and merge into a single dict - - :param splice_graph_dict: Dict of name: SpliceGraph. - """ - logging.debug("\t_merge_node2coords") - node2coord_big = Node2Coordinates() - for splice_graph in splice_graph_dict.values(): - # Get node and edge data - node2coord = nx.get_node_attributes(G=splice_graph, name="coordinates") - - for node, coordinate in node2coord.items(): - if node not in node2coord_big: - node2coord_big[node] = tuple() - node2coord_big[node] += coordinate - - return node2coord_big - - -def _merge_link2overlap(splice_graph_dict: SpliceGraphDict) -> Edge2Overlap: - """Take all link2overlap from every graph and merge into a single dict - - :param splice_graph_dict: Dict of name -> SpliceGraph. - """ - logging.debug("\t_merge_link2overlap") - link2overlap_big = Edge2Overlap() - - for splice_graph in splice_graph_dict.values(): - link2overlap = nx.get_edge_attributes(G=splice_graph, name="overlaps") - for edge, overlap in link2overlap.items(): - link2overlap_big[edge] = overlap - - return link2overlap_big - - -def collapse_splice_graph_dict( - splice_graph_dict: SpliceGraphDict, transcriptome_dict: FastaDict) -> SpliceGraph: - """Collapse nodes by sequence identity - - :param splice_graph_dict: Dict of name -> SpliceGraph. - :param transcriptome_dict: Dict of transcript ids -> sequence. - - """ - logging.info("Collapsing graph by sequence") - - node2coord = _merge_node2coords(splice_graph_dict) - link2overlap = _merge_link2overlap(splice_graph_dict) - - seq2node = _compute_seq2node(node2coord, transcriptome_dict) - old2new = _compute_old2new(seq2node) - del seq2node - - new_node2coord = _compute_new_node2coord(old2new, node2coord) - new_link2overlap = _compute_new_link2overlap(old2new, link2overlap) - - # Build graph - collapsed_graph = nx.DiGraph() - collapsed_graph.add_nodes_from(new_node2coord.keys()) - collapsed_graph.add_edges_from(new_link2overlap.keys()) - nx.set_node_attributes( - G=collapsed_graph, name="coordinates", values=new_node2coord) - nx.set_edge_attributes( - G=collapsed_graph, name="overlaps", values=new_link2overlap) - - return collapsed_graph diff --git a/exfi/correct.py b/exfi/correct.py old mode 100644 new mode 100755 index 0506911..e436665 --- a/exfi/correct.py +++ b/exfi/correct.py @@ -1,133 +1,73 @@ #!/usr/bin/env python3 -""" -exfi.correct_splice_graph.py: functions to take a splice graph and try to fill hypothetical gaps -with abyss-sealer. -""" +"""exfi.new_correct.py: fill small overlaps and gaps with abyss-sealer""" -import logging - -from typing import Dict, Union, Tuple - -from subprocess import \ - Popen from tempfile import \ mkstemp -from os import remove - -import networkx as nx - -import pathos.multiprocessing as mp - -from Bio import \ - SeqIO, \ - Seq, \ - SeqRecord +from subprocess import Popen +import os -from natsort import \ - natsorted +import pandas as pd -from exfi.io.fasta_to_dict import \ - fasta_to_dict +from exfi.io.bed import \ + bed3_to_bed4, \ + bed4_to_node2sequence, \ + bed4_to_edge2overlap -from exfi.classes import FastaDict, Coordinate, Node2Coordinates, Edge2Overlap, SpliceGraph, \ - SpliceGraphDict - -def _get_node2sequence(splice_graph: SpliceGraph, transcriptome_dict: FastaDict) -> Dict[str, str]: - """From the splice graph and a transcriptome, get the exon: sequence dictionary - - :param dict splice_graph: DiGraph from where to extract the node2sequence data. - :param dict transcriptome_dict: Dict of reference transcriptome. +def prepare_sealer(bed4, transcriptome_dict, args): + """exfi.new_correct.prepare_sealer: inspect the bed4 file and create a fasta + file where pairs of exons have a small gap between them or have a small + overlap. """ - logging.debug("\tComputing the exon to sequence dictionary") - node2sequence: Dict[str, str] = {} - - node2coordinates = Node2Coordinates(nx.get_node_attributes( - G=splice_graph, - name="coordinates" - )) - - for node, coordinates in node2coordinates.items(): - for (transcript_id, start, end) in coordinates: - sequence = transcriptome_dict[transcript_id][start:end] - node2sequence[node] = sequence - return node2sequence + sealer_input = mkstemp() -def _prepare_sealer(splice_graph_dict: SpliceGraphDict, args: dict) -> str: - """Prepare fasta file with candidates to be filled with sealer. Return the path - of the fasta file to be sealed. + max_fp_bases = args["max_fp_bases"] + max_gap_size = args["max_gap_size"] - Candidates to be sealed: - - pairs of exons with mall gaps (size <= max_gap_size) - - pairs of exons with any kind of positive overlap + node2sequence = bed4_to_node2sequence(bed4, transcriptome_dict) + edge2overlap = bed4_to_edge2overlap(bed4) + node2sequence_dict = node2sequence.set_index("name").to_dict()["sequence"] - args = { - "input_fasta": str, - "kmer": int, - "max_gap_size": int, - "input_bloom": str, - "max_fp_bases": int - } - - :param dict splice_graph_dict: Dict of SpliceGraphs. - :param dict args: dict of arguments for abyss-sealer. - """ + small_gaps = edge2overlap\ + .loc[(edge2overlap.overlap < 0) & (edge2overlap.overlap <= max_gap_size)] - logging.debug("\tPreparing input for abyss-sealer") - transcriptome_dict = fasta_to_dict(args["fasta"]) + small_gaps["data_to_map"] = tuple(zip(small_gaps.u, small_gaps.v)) - # Prepare fasta for sealer - # Make temporary fasta where to write sequences for sealer - sealer_input = mkstemp() - sequences_to_seal = list() + small_gaps["identifier"] = small_gaps.u + "~" + small_gaps.v - for splice_graph in splice_graph_dict.values(): + small_gaps["sequence"] = small_gaps.data_to_map\ + .map( + lambda x: \ + node2sequence_dict[x[0]][0:-max_fp_bases] + \ + 100 * 'N' + \ + node2sequence_dict[x[1]][max_fp_bases:] + ) - # Get overlap and sequence data - edge2overlap = nx.get_edge_attributes(G=splice_graph, name="overlaps") - node2sequence = _get_node2sequence( - splice_graph=splice_graph, - transcriptome_dict=transcriptome_dict + small_gaps = small_gaps[["identifier", "sequence"]] + + overlaps = edge2overlap.loc[edge2overlap.overlap >= 0] + overlaps["data_to_map"] = tuple(zip(overlaps.u, overlaps.v, overlaps.overlap)) + overlaps["identifier"] = overlaps.u + "~" + overlaps.v + overlaps["sequence"] = overlaps.data_to_map\ + .map( + lambda x: \ + node2sequence_dict[x[0]][0:-x[2] - max_fp_bases] + \ + 100 * 'N' + \ + node2sequence_dict[x[1]][x[2] + max_fp_bases:] ) + overlaps = overlaps[["identifier", "sequence"]] + + for_sealer = pd.concat([small_gaps, overlaps]) + for_sealer["fasta"] = ">" + for_sealer["identifier"] + "\n" + for_sealer["sequence"] + "\n" + for_sealer = for_sealer[["fasta"]] - for (node1, node2), overlap in edge2overlap.items(): - overlap = edge2overlap[(node1, node2)] - if overlap < 0 and overlap <= args["max_gap_size"]: # Small gap - identifier = node1 + "~" + node2 - sequence = Seq.Seq( - node2sequence[node1][0:-args["max_fp_bases"]] - + "N" * 100 - + node2sequence[node2][args["max_fp_bases"]:] - ) - seqrecord = SeqRecord.SeqRecord( - id=identifier, - description="", - seq=sequence - ) - sequences_to_seal.append(seqrecord) - elif overlap >= 0: - # Trim overlap bases from one of the threads - identifier = node1 + "~" + node2 - # Cut overlap from one end, again from the other, and from both to see what happens - - # Both - sequence = Seq.Seq( - node2sequence[node1][0:-overlap - args["max_fp_bases"]] - + "N" * 100 - + node2sequence[node2][overlap + args["max_fp_bases"]:] - ) - seqrecord = SeqRecord.SeqRecord( - id=identifier, - description="", - seq=sequence - ) - sequences_to_seal.append(seqrecord) - - SeqIO.write(format="fasta", handle=sealer_input[1], sequences=sequences_to_seal) + with open(sealer_input[1], "w", 1*1024**3) as f_in: + for fasta_record in for_sealer.fasta.values: + f_in.write(fasta_record) return sealer_input[1] @@ -146,7 +86,7 @@ def _run_sealer(sealer_input_fn: str, args: dict) -> str: :param dict args: Dict of argumnets for sealer """ - logging.debug("\tRunning abyss-sealer") + #logging.debug("\tRunning abyss-sealer") # Run sealer sealer_output_prefix = mkstemp() c_sealer = [ @@ -167,255 +107,53 @@ def _run_sealer(sealer_input_fn: str, args: dict) -> str: p_sealer.communicate() # Clean files - remove(sealer_output_prefix[1] + "_log.txt") - remove(sealer_output_prefix[1] + "_scaffold.fa") - remove(sealer_output_prefix[1]) - return sealer_output_prefix[1] + "_merged.fa" - - -def _collect_sealer_results(handle: str) -> set: - """Process extensions from sealer and return the computed extensions in a set of pairs of - tuples: set((node1, node2), ... , (nodeN-1, nodeN)) - - :param str handle: Output filename from abyss-sealer. - """ - logging.debug("\tCollecting abyss-sealer results") - # Collect results - filled_edges = set() - for corrected in fasta_to_dict(handle).keys(): - node1, node2 = corrected.rsplit("_", 2)[0].split("~") - filled_edges.add((node1, node2)) - return filled_edges - - -def _filled_edges_by_transcript(filled_edges: set) -> Dict[str, set]: - """Split the edge2fill by the transcript they belong. Result is - dict(transcript_id: set) - - :param set filled_edges: set of pairs of filled edges. - """ - logging.debug("\tSplitting sealer results by transcript") - filled_edges_by_transcript: Dict[str, set] = {} - for node_u, node_v in filled_edges: - transcript = node_u.rsplit(":")[0] - if transcript not in filled_edges_by_transcript: - filled_edges_by_transcript[transcript] = set() - filled_edges_by_transcript[transcript].add((node_u, node_v)) - return filled_edges_by_transcript - - -def _rename_nodes_from_collapse(quotient_graph: SpliceGraph) -> Dict[str, str]: - """Compose the new_node ids from nx.quotient to str or tuples of strs - - :param nx.DiGraph quotient_graph: DiGraph with collapsed nodes with nx.qoutient. - """ - logging.debug("\tRenaming collapsed nodes") - # Main dict - mapping: Dict[str, Union[str, Tuple[str, ...]]] = { # Old -> New - node_id: tuple(natsorted(node for node in node_id)) - for node_id in quotient_graph.nodes() - } - - # Convert single item tuples to str - for key, value in mapping.items(): - if len(value) == 1: - mapping[key] = value[0] - return mapping - + os.remove(sealer_output_prefix[1] + "_log.txt") + os.remove(sealer_output_prefix[1] + "_scaffold.fa") + os.remove(sealer_output_prefix[1]) + return sealer_output_prefix[1] + "_merged.fa" -def _recompute_node2coord( - component: SpliceGraph, quotient_relabeled: SpliceGraph) -> Node2Coordinates: - """Compute the new node2coord for the quotient graph - - :param nx.DiGraph component: old SpliceGraph. - :param nx.DiGraph quotient_relabeled: new SpliceGraph, already relabeled. - """ - logging.debug("\tRecomputing node2coord") - # Get the new_node2coord data - old_node2coord = Node2Coordinates(nx.get_node_attributes(G=component, name="coordinates")) - new_node2coord = Node2Coordinates() - for nodes in quotient_relabeled.nodes(): - if isinstance(nodes, str): # node is untouched in quotient - new_node2coord[nodes] = old_node2coord[nodes] - elif isinstance(nodes, tuple): # node was touched - first_old_node = nodes[0] - last_old_node = nodes[-1] - transcript, start, _ = old_node2coord[first_old_node][0] - _, _, end = old_node2coord[last_old_node][0] - new_node2coord[nodes] = (Coordinate(transcript, start, end), ) - return new_node2coord - - -def _recompute_edge2overlap( - component: SpliceGraph, quotient_relabeled: SpliceGraph) -> Edge2Overlap: - """Compute the new node2coord for the quotient graph - - :param nx.DiGraph component: old SpliceGraph. - :param nx.DiGraph quotient_relabeled: new SpliceGraph. - """ - logging.debug("\tRecomputing edge2overlap") - - old_edge2overlap = Edge2Overlap(nx.get_edge_attributes(G=component, name="overlaps")) - new_edge2overlap = dict() - - for edge in quotient_relabeled.edges(): - node_u, node_v = edge - - if isinstance(node_u, tuple) and isinstance(node_v, tuple): - new_edge2overlap[(node_u, node_v)] = old_edge2overlap[(node_u[-1], node_v[0])] - elif isinstance(node_u, tuple) and isinstance(node_v, str): - new_edge2overlap[(node_u, node_v)] = old_edge2overlap[(node_u[-1], node_v)] - elif isinstance(node_u, str) and isinstance(node_v, tuple): - new_edge2overlap[(node_u, node_v)] = old_edge2overlap[(node_u, node_v[0])] - else: - new_edge2overlap[(node_u, node_v)] = old_edge2overlap[(node_u, node_v)] - - return new_edge2overlap - - -def _compute_new_node_ids( - quotient_relabeled: SpliceGraph, component: SpliceGraph) -> Dict[str, str]: - """Compose the new node id for every collapsed node in nx.quotient_graph - - :param nx.DiGraph quotient_relabeled: new SpliceGraph. - :param nx.DiGraph component: old SpliceGraph. - """ - - logging.debug("\tRecomputing final node identifiers") - - quotient_mapping: Dict[str, str] = {} - old_node2coord = nx.get_node_attributes(G=component, name="coordinates") - - for node in quotient_relabeled.nodes(): - - if isinstance(node, tuple): # Collapsed node - - # Compute starting node and coordinates - node = tuple(natsorted(node)) - first_node_id = node[0] - last_node_id = node[-1] - transcript_id, start, _ = old_node2coord[first_node_id][0] - _, _, end = old_node2coord[last_node_id][0] - - # Compose new node - new_node_id = f"{transcript_id}:{start}-{end}" - quotient_mapping[node] = new_node_id - else: - quotient_mapping[node] = node - - return quotient_mapping - - -def _sculpt_graph(splice_graph: SpliceGraph, filled_edges: set) -> SpliceGraph: - """Apply sealer corrections in filled_edges to the splice graph - - :param nx.DiGraph splice_graph: SpliceGraph to be corrected. - :param set filled_edges: set of edges to be merged. - """ - - logging.debug("\tSculpting graph") - - if not filled_edges: - return splice_graph - - # Compute the quotient graph - def partition(node_u: str, node_v: str) -> bool: - """Function to test if node_u and node_v belong to the same partition of the graph - - :param node_u: - :param node_v: - - """ - graph = SpliceGraph() - graph.add_edges_from(filled_edges) - if node_u in graph.nodes() and \ - node_v in graph.nodes() and \ - nx.has_path(G=graph, source=node_u, target=node_v): - return True - return False - # Compute the quotient graph - quotient = nx.quotient_graph(G=splice_graph, partition=partition) +def collect_sealer_results(filename): + """Read the fasta output from sealer and return the merged nodes""" - # Rename nodes (frozensets are tricky) - mapping_sg_to_partition = _rename_nodes_from_collapse(quotient) - quotient_relabeled = nx.relabel_nodes( - G=quotient, - mapping=mapping_sg_to_partition - ) + if os.path.getsize(filename) == 0: + return pd.DataFrame(data=None, columns=["raw"]) - # Recompute graph info - node2coord = _recompute_node2coord(splice_graph, quotient_relabeled) - edge2overlap = _recompute_edge2overlap(splice_graph, quotient_relabeled) - node_ids = _compute_new_node_ids(component=splice_graph, quotient_relabeled=quotient_relabeled) + headers = pd.read_csv(filename, header=None, sep="\t") + print(headers) + headers = headers.iloc[::2] # Take odd rows: headers. - # Set new info - nx.set_node_attributes(G=quotient_relabeled, name="coordinates", values=node2coord) - nx.set_edge_attributes(G=quotient_relabeled, name="overlaps", values=edge2overlap) - component_final = nx.relabel_nodes(G=quotient_relabeled, mapping=node_ids) + headers.columns = ["raw"] + headers["clean"] = headers\ + .raw\ + .str.slice(1)\ + .str.rsplit("_", 2).str[0]\ + .str.split("~") + headers["u"], headers["v"] = headers.clean.str + headers = headers[["u", "v"]] + return headers - return component_final +def apply_correction_to_bed4(bed4, sealed_edges): + """Merge nodes into a single ones, being careful with the coordinates""" + if sealed_edges.shape[0] == 0: + return bed4 + new_bed4 = bed4.copy().set_index("name") + for row in sealed_edges.iloc[::-1].itertuples(): + new_bed4.loc[row.u, "chromEnd"] = new_bed4.loc[row.v, "chromStart"] + new_bed4 = new_bed4.drop(sealed_edges["v"].values) + new_bed4 = bed3_to_bed4(new_bed4[["chrom", "chromStart", "chromEnd"]]) + return new_bed4 -def correct_splice_graph_dict(splice_graph_dict: SpliceGraphDict, args: dict) -> SpliceGraphDict: - """Try to correct small gaps and some overlaps (SNPs and indels) with abyss-sealer - args = { - "kmer": int, - "max_gap_size": int, - "input_bloom": str, - "input_fasta": str - } - :param dict splice_graph_dict: Dict of splice graphs, one per transcript. - :param dict args: arguments for abyss-sealer +def correct_bed4(bed4, transcriptome_dict, args): + """Inspect the bed4 for small gaps and overlaps, write a fasta file for + sealer, and correct the bed4. """ - logging.info("Correct splice graph with abyss-sealer") - - # Compose fasta with candidates to be filled - sealer_input_fn = _prepare_sealer( - splice_graph_dict=splice_graph_dict, - args=args - ) - - # Run sealer - sealer_output_fn = _run_sealer( - sealer_input_fn=sealer_input_fn, - args=args - ) - - # Collect results - logging.info("\tCollecting results") - filled_edges = _collect_sealer_results(handle=sealer_output_fn) - remove(sealer_input_fn) - remove(sealer_output_fn) - filled_edges_by_transcript = _filled_edges_by_transcript( - filled_edges=filled_edges) - del filled_edges - - # Complete the filled_edges_by_transcript dict - for transcript in splice_graph_dict: - if transcript not in filled_edges_by_transcript: - filled_edges_by_transcript[transcript] = set() - - # Initialize pool of workers - pool = mp.Pool(args["threads"], maxtasksperchild=1) - - # Process each graph in parallel - logging.info("\tCorrecting each splice graph") - corrected_splice_graphs = pool.starmap( - _sculpt_graph, - zip(splice_graph_dict.values(), filled_edges_by_transcript.values()), - chunksize=1000 # Number of splice graphs to process at once. - ) - pool.close() - pool.join() - - - splice_graph_dict = SpliceGraphDict( - zip(splice_graph_dict.keys(), corrected_splice_graphs) - ) - - logging.info("\tDone correcting") - return splice_graph_dict + sealer_input_fn = prepare_sealer(bed4=bed4, transcriptome_dict=transcriptome_dict, args=args) + output_sealer_fn = _run_sealer(sealer_input_fn=sealer_input_fn, args=args) + sealer_results = collect_sealer_results(filename=output_sealer_fn) + bed4_corrected = apply_correction_to_bed4(bed4, sealer_results) + return bed4_corrected diff --git a/exfi/find_exons.py b/exfi/find_exons.py index 1a913c1..578bd20 100644 --- a/exfi/find_exons.py +++ b/exfi/find_exons.py @@ -9,35 +9,37 @@ """ -# Import everything - -from typing import \ - Iterable, \ - Tuple - import logging from subprocess import Popen, PIPE -from exfi.classes import Coordinate, FastaDict +import pandas as pd +import numpy as np -def _process_output(process: Popen) -> Iterable[Coordinate]: +def _process_output(process): """Get lines in bed format from the output of a Popen. :param Popen process: Popen object. """ - for stdout_line in iter(process.stdout.readline, b''): - chromosome, start, end = stdout_line.decode().strip().split() - coordinate = Coordinate(chromosome, int(start), int(end)) - yield coordinate - # yield Coordinate(stdout_line.decode().strip().split()) + + bed3 = pd.DataFrame( + data=[ + stdout_line.decode().strip().split() + for stdout_line in iter(process.stdout.readline, b'') + ], + columns=["chrom", "chromStart", "chromEnd"] + ) + + bed3.chromStart = bed3.chromStart.astype(np.int64) + bed3.chromEnd = bed3.chromEnd.astype(np.int64) + process.stdout.close() process.wait() + return bed3 + -def _get_fasta( - transcriptome_dict: FastaDict, iterable_of_bed: Iterable[Coordinate]) -> \ - Iterable[Tuple[str, str]]: +def _get_fasta(transcriptome_dict, iterable_of_bed): """Extract subsequences in trancriptome_fn according to locis. :param dict transcriptome_dict: FastaDict of the transcriptome @@ -51,7 +53,7 @@ def _get_fasta( yield (identifier, seq) -def _find_exons_pipeline(args: dict) -> Iterable[Coordinate]: +def _find_exons_pipeline(args): """Find exons according to the Bloom filter -> BED Main pipeline: @@ -89,4 +91,4 @@ def _find_exons_pipeline(args: dict) -> Iterable[Coordinate]: p_filter = Popen(c_filter, stdin=p_merge1.stdout, stdout=PIPE) p_merge2 = Popen(c_merge2, stdin=p_filter.stdout, stdout=PIPE) p_kmers.stdout.close() - yield from _process_output(p_merge2) + return _process_output(p_merge2) diff --git a/exfi/io/__init__.py b/exfi/io/__init__.py index c9fc85b..ae26620 100644 --- a/exfi/io/__init__.py +++ b/exfi/io/__init__.py @@ -9,20 +9,20 @@ - Data frames, ... """ -import logging - -from typing import \ - Tuple - -from exfi.classes import Coordinate - - -def _coordinate_to_str(coordinate: Coordinate) -> str: - """Convert coordinates to str seqid:start-end - - :param str seqid: Sequence identifier - :param int start: Start position - :param int end: End position - """ - seqid, start, end = coordinate - return f"{seqid}:{start}-{end}" +# import logging +# +# from typing import \ +# Tuple +# +# from exfi.classes import Coordinate +# +# +# def _coordinate_to_str(coordinate: Coordinate) -> str: +# """Convert coordinates to str seqid:start-end +# +# :param str seqid: Sequence identifier +# :param int start: Start position +# :param int end: End position +# """ +# seqid, start, end = coordinate +# return f"{seqid}:{start}-{end}" diff --git a/exfi/io/bed.py b/exfi/io/bed.py new file mode 100644 index 0000000..4bacb6f --- /dev/null +++ b/exfi/io/bed.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 + +"""exfi.io.bed.py: submodule to wrangle BED dataframes""" + + +def bed3_to_bed4(bed3): + """Take a BED3 dataframe and add the name as: + "chrom:chromStart+chromEnd" + """ + bed4 = bed3.copy() + bed4["name"] = \ + bed4.chrom + ":" + \ + bed4.chromStart.map(str) + "-" + \ + bed4.chromEnd.map(str) + return bed4 + + +def bed4_to_node2coordinates(bed4): + """Compute the node2coordinates DataFrame: exon name, chrom, start, end""" + node2coordinates = bed4\ + [["name", "chrom", "chromStart", "chromEnd"]]\ + .set_index("name") + return node2coordinates + + +def bed4_to_path2nodes(bed4): + """Compute the correspondance between a transcript and its exons: + {transcript_id : list of exons}. + """ + return bed4\ + .drop(columns=["chromStart", "chromEnd"])\ + .groupby("chrom")\ + .agg(lambda x: x.tolist())\ + .to_dict()["name"] + + +def bed4_to_node2sequence(bed4, transcriptome_dict): + """Compute the correspondence between an exon name and its sequence as a + pd.DataFrame with cols name and sequence. + """ + node2sequence = bed4.copy() + node2sequence["sequence"] = node2sequence.chrom.map(transcriptome_dict) + node2sequence["data_to_map"] = list(zip( + node2sequence.sequence, + node2sequence.chromStart, + node2sequence.chromEnd + )) + node2sequence.sequence = node2sequence.data_to_map.map(lambda x: x[0][x[1]:x[2]]) + return node2sequence[["name", "sequence"]] + + +def bed4_to_edge2overlap(bed4): + """Compute the overlaps between a pair of overlapping exons. + Dataframe is name_u, name_v, overlap_int. + """ + overlaps = bed4.copy() + # Get the transcript_id of the next exon + overlaps["chromNext"] = overlaps["chrom"].shift(-1) + # Get the name of the next exon + overlaps["nameNext"] = overlaps["name"].shift(-1) + # Get the start of the next exon + overlaps["chromStartNext"] = overlaps["chromStart"].shift(-1) + # Get the end of the next exon + overlaps["chromEndNext"] = overlaps["chromEnd"].shift(-1) + # Remove rows with different transcripts + overlaps = overlaps\ + [overlaps["chrom"] == overlaps["chromNext"]] + # Convert types + overlaps = overlaps.astype({"chromStartNext": int, "chromEndNext": int}) + # Compute the overlap + overlaps["overlap"] = overlaps["chromEnd"] - overlaps["chromStartNext"] + # Convert again just in case + overlaps.astype({"overlap": int}) + # Select and rename + overlaps = overlaps\ + [["name", "nameNext", "overlap"]]\ + .rename({"name": "u", "nameNext": "v"}, axis=1) + return overlaps diff --git a/exfi/io/bed4_to_gfa1.py b/exfi/io/bed4_to_gfa1.py new file mode 100644 index 0000000..55c72ca --- /dev/null +++ b/exfi/io/bed4_to_gfa1.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 + +"""exfi.io.bed4_to_gfa1.py: submodule to write a BED4 dataframe to GFA1 format +""" + +import pandas as pd + +from exfi.io.bed import \ + bed4_to_node2sequence, \ + bed4_to_edge2overlap + +def compute_header(): + """Write GFA1 header""" + header = pd.DataFrame( + data=[["H", "VN:Z:1.0"]], + columns=["RecordType", "Version"] + ) + return header + + +def compute_segments(bed4, transcriptome_dict): + """Create the Segments subdataframe for GFA1 file""" + segments = bed4_to_node2sequence(bed4=bed4, transcriptome_dict=transcriptome_dict) + # Add the S and length columns + segments["RecordType"] = "S" + segments["SegmentLength"] = segments.sequence.map(lambda x: "LN:i:" + str(len(x))) + # reorder + segments = segments\ + [["RecordType", "name", "sequence", "SegmentLength"]] + return segments + + +def compute_links(bed4): + """Compute the Links subdataframe of a GFA1 file.""" + links = bed4_to_edge2overlap(bed4=bed4) + links.columns = ["From", "To", "Overlap"] + links["RecordType"] = "L" + links["FromOrient"] = "+" + links["ToOrient"] = "+" + links["Overlap"] = links.Overlap.map(lambda x: str(x) + "M" if x >= 0 else str(-x) + "N") + links = links[["RecordType", "From", "FromOrient", "To", "ToOrient", "Overlap"]] + return links + + +def compute_containments(bed4): + """Create the minimal containments subdataframe""" + containments = bed4.copy() + containments["RecordType"] = "C" + containments["Container"] = containments["chrom"] + containments["ContainerOrient"] = "+" + containments["Contained"] = containments["name"] + containments["ContainedOrient"] = "+" + containments["Pos"] = containments["chromStart"] + containments["Overlap"] = containments["chromEnd"] - containments["chromStart"] + containments["Overlap"] = containments.Overlap.map(lambda x: str(x) + "M") + containments = containments.drop(["chrom", "chromStart", "chromEnd", "name"], axis=1) + return containments + + +def compute_paths(bed4): + """Compute the Paths section of the GFA1 file""" + paths = bed4.copy() + paths["name"] = paths["name"].map(lambda x: x + "+") + paths = paths\ + .drop(columns=["chromStart", "chromEnd"])\ + .groupby("chrom", axis=0)\ + .aggregate(lambda x: ",".join(x.tolist())) + paths = paths.reset_index(drop=False) + paths["RecordType"] = "P" + paths = paths.rename({"chrom": "PathName", "name": "SegmentNames"}, axis=1) + paths["Overlaps"] = "*" + paths = paths[["RecordType", "PathName", "SegmentNames", "Overlaps"]] + return paths + + +def bed4_to_gfa1(gfa1_fn, bed4, transcriptome_dict): + """Convert the BED4 dataframe into a GFA1 file""" + with open(gfa1_fn, "w", 1024**3) as gfa: + compute_header()\ + .to_csv(gfa, sep="\t", header=False, index=False) + with open(gfa1_fn, "a", 1024**3) as gfa: + compute_segments(bed4, transcriptome_dict)\ + .to_csv(gfa, sep="\t", header=False, index=False) + compute_links(bed4)\ + .to_csv(gfa, sep="\t", header=False, index=False) + compute_containments(bed4)\ + .to_csv(gfa, sep="\t", header=False, index=False) + compute_paths(bed4)\ + .to_csv(gfa, sep="\t", header=False, index=False) diff --git a/exfi/io/components.py b/exfi/io/components.py deleted file mode 100644 index ab0bce1..0000000 --- a/exfi/io/components.py +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/bin/env python3 - -"""exfi.io.join_components: Submodule to convert a dict of splice graphs into a single splice graph -""" - -import logging - -import networkx as nx - -from natsort import natsorted - -from exfi.classes import \ - SpliceGraph, \ - SpliceGraphDict - -def join_components(dict_of_components: SpliceGraphDict) -> SpliceGraph: - """Merge all splice graphs in dict_of_components into a single splice_graph - - :param dict dict_of_components: Dict of Splice Graphs, key is transcript_id, value is nx.DiGraph - """ - - logging.info("\tJoining multiple splice graphs into one") - - # Join everything into a splice_graph - joint = nx.DiGraph() - - # Nodes - logging.info("\t\tProcessing nodes") - node2coordinate = { - node: coordinate - for subgraph in dict_of_components.values() - for node, coordinate in nx.get_node_attributes(G=subgraph, name="coordinates").items() - } - - joint.add_nodes_from(node2coordinate.keys()) - nx.set_node_attributes(G=joint, name="coordinates", values=node2coordinate) - - # Edges - logging.info("\t\tProcessing edges") - edge2overlap = { - edge: overlap - for subgraph in dict_of_components.values() - for edge, overlap in nx.get_edge_attributes(G=subgraph, name="overlaps").items() - } - joint.add_edges_from(edge2overlap.keys()) - nx.set_edge_attributes(G=joint, name="overlaps", values=edge2overlap) - - return joint - - -def split_into_components(splice_graph: nx.DiGraph) -> dict: - """Convert a single splice graph into a dict of splice graphs - - - keys are transcript_ids - - values are the splice graph of that transcript - - :param nx.DiGraph splice_graph: Graph to be splitted into connected components. - """ - logging.info("\tSplitting directed graph into directed components") - # Compute connected components - logging.info("\t\tComputing undirected components") - undirected_components = nx.connected_component_subgraphs( - G=splice_graph.to_undirected()) - - component_dict = {} - - node2coord_big = nx.get_node_attributes(G=splice_graph, name="coordinates") - edge2overlap_big = nx.get_edge_attributes(G=splice_graph, name="overlaps") - - logging.info("\t\tComputing directed components") - for undirected_component in undirected_components: - - # Get the transcript_id of the component - logging.info("\t\t\tGetting component info") - nodes = tuple(x for x in undirected_component.nodes()) - a_node = nodes[0] - transcript = undirected_component.node[a_node]["coordinates"][0][0] - logging.info("\t\t\tProcessing component %s", transcript) - - logging.info("\t\t\t\tGetting node2coord") - node2coord = { - node: node2coord_big[node] - for node in undirected_component.nodes() - } - - logging.info("\t\t\t\tGetting edge2overlap") - edges = { - tuple(natsorted([node_u, node_v])) - for node_u, node_v in undirected_component.edges() - } - edge2overlap = { - edge: edge2overlap_big[edge] - for edge in edges - } - - # Re-create directed graph - # Nodes - logging.info("\t\t\t\tCreating empty graph") - directed_component = nx.DiGraph() - - logging.info("\t\t\t\tAdding nodes") - directed_component.add_nodes_from(node2coord.keys()) - nx.set_node_attributes( - G=directed_component, - name="coordinates", - values=node2coord - ) - # Edges - logging.info("\t\t\t\tAdding edges") - directed_component.add_edges_from(edge2overlap.keys()) - nx.set_edge_attributes( - G=directed_component, - name="overlaps", - values=edge2overlap - ) - - # Store directed component in its position - component_dict[transcript] = directed_component - - return component_dict diff --git a/exfi/io/fasta_to_dict.py b/exfi/io/fasta_to_dict.py index 3854218..a82c2e8 100644 --- a/exfi/io/fasta_to_dict.py +++ b/exfi/io/fasta_to_dict.py @@ -5,16 +5,16 @@ from Bio.SeqIO.FastaIO import \ SimpleFastaParser -from exfi.classes import FastaDict +# from exfi.classes import FastaDict -def fasta_to_dict(filename: str) -> FastaDict: +def fasta_to_dict(filename): """Fast Fasta to dict via SimpleFastaParser :param filename: str: Path to the fasta file """ with open(filename, "r") as handle: - return FastaDict( - (identifier.split()[0], sequence) + return { + identifier.split()[0]: sequence for identifier, sequence in SimpleFastaParser(handle) - ) + } diff --git a/exfi/io/gfa1_to_bed.py b/exfi/io/gfa1_to_bed.py new file mode 100644 index 0000000..902037d --- /dev/null +++ b/exfi/io/gfa1_to_bed.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 + +"""exfi.io.gfa1_to_bed.py: submodule to read a GFA1 file and convert it to BED4 +""" + +import pandas as pd +import numpy as np + + +def gfa1_to_bed4(filename): + """Read a GFA1 file and convert it to BED4""" + + with open(filename, "r") as gfa: + containments = pd.DataFrame( + data=[ + x.strip().split("\t") for x in gfa.readlines() if x[0] == "C" + ], + columns=["RecordType", "Container", "ContainerOrient", "Contained", + "ContainedOrient", "Pos", "Overlap"], + dtype=None + )\ + .astype(dtype={"Pos": np.int}) + + containments = containments.rename({ + "Container": "chrom", + "Contained": "name" + }, axis=1) + containments["Overlap"] = containments["Overlap"]\ + .map(lambda x: int(x[:-1])) + containments["chromStart"] = containments["Pos"] + containments["chromEnd"] = containments["Pos"] + containments["Overlap"] + containments = containments[["chrom", "chromStart", "chromEnd", "name"]] + return containments diff --git a/exfi/io/gfa1_to_exons.py b/exfi/io/gfa1_to_exons.py deleted file mode 100644 index 3859b38..0000000 --- a/exfi/io/gfa1_to_exons.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python3 - -"""exfi.io.gfa1_to_exons.py: submodule to convert a GFA1 file into a fasta containing only the exons -""" - -import logging - -from Bio import SeqIO -from Bio.SeqRecord import SeqRecord -from Bio.Seq import Seq - -from exfi.io import _coordinate_to_str -from exfi.io.read_gfa1 import read_gfa1 -from exfi.io.masking import _mask - - - -def gfa1_to_exons(gfa_in_fn: str, fasta_out_fn: str, masking: str = "none") -> None: - """Write the exons in FASTA format present in a GFA1 file - - :param str gfa_in_fn: Path to input GFA1 file - :param fasta_out_fn: Path to output FASTA FILE - :param masking: (Default value = "none") Type of masking to make. Options are "none", "soft" - and "hard". - """ - logging.info("Converting GFA1 file %s into exon fasta %s", gfa_in_fn, fasta_out_fn) - gfa1 = read_gfa1(gfa_in_fn) - - exon2sequence = gfa1["segments"] - exon2coordinates = gfa1["containments"] - link2overlap = gfa1["links"] - - # Mask if necessary - exon2sequence = _mask(exon2sequence, link2overlap, masking) - - # Add coordinate information to description - # Compose SeqRecord of each exon - logging.info("\tComposing SeqRecords") - sequences = [] - for exon_id, exon_sequence in exon2sequence.items(): - logging.debug("Processing %s", exon_id) - - # Compose coordinates - exon_coordinates = exon2coordinates[exon_id] - description = " ".join( - _coordinate_to_str(coordinate) - for coordinate in exon_coordinates - ) - sequences.append(SeqRecord( - id=exon_id, - description=description, - seq=Seq(exon_sequence) - )) - - # Write to fasta - logging.info("\tWriting fasta to file") - SeqIO.write(format="fasta", handle=fasta_out_fn, sequences=sequences) diff --git a/exfi/io/gfa1_to_fasta.py b/exfi/io/gfa1_to_fasta.py new file mode 100644 index 0000000..fe2e44b --- /dev/null +++ b/exfi/io/gfa1_to_fasta.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 + +"""exfi.io.gfa1_to_exons.py: submodule to read a gfa1, extract the exons and +store it in fasta format""" + +import pandas as pd + +def gfa1_to_exons(fasta_out, gfa1_in): + """Extract the exons in Fasta format""" + with open(gfa1_in, "r") as gfa, open(fasta_out, "w") as fasta: + segments = pd.DataFrame( + data=[ + x.strip().split("\t")[0:3] + for x in gfa.readlines() if x[0] == "S" + ], + columns=["RecordType", "Name", "Sequence"], + ) + segments["fasta"] = ">" + segments["Name"] + "\n" + segments["Sequence"] + segments.fasta.values.tofile(fasta, sep="\n", format="%s") + fasta.write("\n") # End line + + +def gfa1_to_gapped_transcripts(fasta_out, gfa1_in, gap_size=100): + """Convert a GFA1 file to a gapped transcript file""" + + with open(gfa1_in, "r") as gfa, open(fasta_out, "w") as fasta: + + separator = gap_size * 'N' + + # Read only segments and paths + data = [ + x.strip().split("\t") + for x in gfa.readlines() if x[0] in set(["S", "P"]) + ] + + # Create {node_id: nucleotide} + node2sequence = pd.DataFrame( + data=[x[0:3] for x in data if x[0] == "S"], + columns=["RecordType", "Name", "Sequence"], + )\ + .drop(columns="RecordType")\ + .set_index("Name")\ + .to_dict()["Sequence"] + + # Get the path info + paths = pd.DataFrame( + data=[x[0:4] for x in data if x[0] == "P"], + columns=["RecordType", "PathName", "SegmentNames", "Overlaps"] + )\ + .drop(columns=["RecordType", "Overlaps"]) + del data + + paths["SegmentNames"] = paths["SegmentNames"].str.replace("+", "") + + # Compose the sequence + paths["gapped_sequence"] = paths\ + .SegmentNames\ + .str.split(',')\ + .map(lambda x: separator.join([node2sequence[y] for y in x])) + del node2sequence + + # Create the fasta line + paths["fasta"] = \ + ">" + paths.PathName + " " + paths.SegmentNames + "\n" + \ + paths.gapped_sequence + + # Dump everything + paths.fasta.values.tofile(fasta, sep="\n", format="%s") + fasta.write("\n") # End line diff --git a/exfi/io/gfa1_to_gapped_transcripts.py b/exfi/io/gfa1_to_gapped_transcripts.py deleted file mode 100644 index 2d9e32f..0000000 --- a/exfi/io/gfa1_to_gapped_transcripts.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python3 - -"""exfi.io.gfa1_to_gapped_transcript.py: submodule to convert a GFA1 file to a fasta file where the -spaces between predicted exons are filled with a string of Ns. -""" - -import logging - -from Bio.SeqRecord import SeqRecord -from Bio.Seq import Seq -from Bio import SeqIO - -from exfi.io.masking import _mask -from exfi.io.read_gfa1 import read_gfa1 - -from exfi.classes import Node2Coordinates, Path2Nodes - -def _compose_paths( - exon_dict: Node2Coordinates, path_dict: Path2Nodes, number_of_ns: int = 100) -> SeqRecord: - """Compose and return each gapped transcript. - - :param exon_dict: dict of exons: {exon_id: ((seq1, start1, end1,), ...)}. - :param path_dict: dict of paths: {transcript1: (exon1, ..., exonN)}. - :param number_of_ns: number of Ns to write between each exon. - """ - logging.info("\tComposing paths") - chunk_of_ns = "N" * number_of_ns - for transcript_id, exon_list in sorted(path_dict.items()): - exon_seqs = [str(exon_dict[exon_id]) for exon_id in exon_list] - yield SeqRecord( - id=transcript_id, - description=",".join(exon_list), - seq=Seq(chunk_of_ns.join(exon_seqs)) - ) - - -def gfa1_to_gapped_transcripts( - gfa_in: str, fasta_out: str, number_of_ns: int = 100, masking: str = "none") -> None: - """Write gapped transcripts as fasta from GFA1 file - - :param str gfa_in: Path to input GFA1 file. - :param str fasta_out: Patho to output FASTA file. - :param int number_of_ns: Number of Ns to be written between each exon (Default value = 100). - :param str masking: Type of masking to be applied: none, soft, hard (Default value = "none"). - """ - - logging.info("Converting GFA1 file %s to gapped transcript fasta %s", gfa_in, fasta_out) - - # Process - gfa1 = read_gfa1(gfa_in) - exon_dict = gfa1["segments"] - overlap_dict = gfa1["links"] - path_dict = gfa1["paths"] - - - # Mask if necessary - exon_dict = _mask(exon_dict, overlap_dict, masking) - - composed_paths = _compose_paths(exon_dict, path_dict, number_of_ns) - - # Write - logging.info("\tWriting to file") - SeqIO.write(format="fasta", sequences=composed_paths, handle=fasta_out) diff --git a/exfi/io/gfa1_to_splice_graph_dict.py b/exfi/io/gfa1_to_splice_graph_dict.py deleted file mode 100644 index a2305c5..0000000 --- a/exfi/io/gfa1_to_splice_graph_dict.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python3 - -"""exfi.io.gfa1_to_splice_graph.py: submodule to convert a gfa1 file into a splice graph""" - -import logging - -from typing import Dict - -import networkx as nx - -from exfi.io.read_gfa1 import \ - read_gfa1 - -from exfi.classes import \ - Node2Coordinates, \ - Edge2Overlap, \ - SpliceGraph, \ - SpliceGraphDict - -def _split_node2coord( - node2coord: Node2Coordinates, node2transcript: Dict[str, str]) -> \ - Dict[str, Node2Coordinates]: - """Split the big node2coord dict into its subcomponents (transcripts) - - :param dict node2coord: dict of the shape key=node_id, value=((seq1, start1, node1), - ..., (seqN, startN, nodeN)) - :param dict node2transcript: dict of the shape node_id: transcript_id - """ - splitted_node2coord: Dict[str, Node2Coordinates] = { - key: Node2Coordinates() - for key in set(node2transcript.values()) - } - for node, coordinates in node2coord.items(): - transcript = node2transcript[node] - if node in splitted_node2coord[transcript]: - splitted_node2coord[transcript][node] += coordinates - else: - splitted_node2coord[transcript][node] = tuple(coordinates) - # if node not in splitted_node2coord[transcript]: - # splitted_node2coord[transcript][node] = tuple(Coordinate()) - # splitted_node2coord[transcript][node] += coordinates - return splitted_node2coord - - - -def _split_edge2overlap(edge2overlap: Edge2Overlap, node2transcript: Dict[str, str]) -> \ - Dict[str, Edge2Overlap]: - """Split the big edge2overlap dict into subcomponents (transcripts) - - :param dict edge2overlap: dict of the shape key=(node1, node2), value= overlap inbases between - node1 and node2. Positive value means overlap, negative value means gap of that size between - them - :param dict node2transcript: dict of the shape key=exon_id, value= transcripts to which it - belongs - """ - splitted_edge2overlap: Dict[str, Edge2Overlap] = { - transcript: Edge2Overlap() - for transcript in set(node2transcript.values()) - } - for edge, overlap in edge2overlap.items(): - transcript = node2transcript[edge[0]] - splitted_edge2overlap[transcript][edge] = overlap - return splitted_edge2overlap - - - -def gfa1_to_splice_graph_dict(handle: str) -> SpliceGraphDict: - """Read a GFA1 file and store the SpliceGraphDict - - :param str handle: Path to input GFA1 file - """ - logging.info("Converting gfa1 %s to splice graph", handle) - - # Read and process - logging.info("\tReading and processing GFA file %s", handle) - gfa1 = read_gfa1(handle) - node2coord = gfa1["containments"] - edge2overlap = gfa1["links"] - transcript2nodes = gfa1["paths"] - - # Revert path2nodes - node2transcript = { - value: key - for key, values in transcript2nodes.items() - for value in values - } - - # Split node2coord - transcript2node2coord = _split_node2coord(node2coord, node2transcript) - transcript2edge2overlap = _split_edge2overlap(edge2overlap, node2transcript) - - # Initialize - splice_graph_dict = SpliceGraphDict({ - transcript: SpliceGraph() - for transcript in transcript2nodes - }) - - # process - for transcript in splice_graph_dict.keys(): - splice_graph = SpliceGraph() - - node2coord = transcript2node2coord[transcript] - splice_graph. add_nodes_from(node2coord.keys()) - nx.set_node_attributes(G=splice_graph, name="coordinates", values=node2coord) - - edge2overlap = transcript2edge2overlap[transcript] - splice_graph.add_edges_from(edge2overlap.keys()) - nx.set_edge_attributes(G=splice_graph, name="overlaps", values=edge2overlap) - - splice_graph_dict[transcript] = splice_graph - - return splice_graph_dict diff --git a/exfi/io/masking.py b/exfi/io/masking.py index af98ba4..9f5ecf0 100644 --- a/exfi/io/masking.py +++ b/exfi/io/masking.py @@ -7,10 +7,6 @@ import logging -from exfi.classes import \ - FastaDict, \ - Edge2Overlap - def _process_overlap_cigar(cigar_string: str) -> list: """Process a simple CIGAR string. @@ -37,13 +33,13 @@ def _soft_mask_left(string: str, n_bases: int) -> str: return string[:n_bases].lower() + string[n_bases:] -def _soft_mask(exon_dict: FastaDict, overlap_dict: Edge2Overlap) -> FastaDict: +def _soft_mask(exon_dict, overlap_dict): """Soft mask all overlaps in the exon_dict. :param dict exon_dict: dict of exon_id: sequence :param dict overlap_dict: dict of (node1, node2): overlap """ - exon_dict = FastaDict(exon_dict.copy()) + exon_dict = exon_dict.copy() for (start, end), overlap in overlap_dict.items(): if overlap > 0: exon_dict[start] = _soft_mask_right(exon_dict[start], overlap) @@ -69,13 +65,13 @@ def _hard_mask_left(string: str, n_bases: int): return "N" * n_bases + string[n_bases:] -def _hard_mask(exon_dict: FastaDict, overlap_dict: Edge2Overlap) -> FastaDict: +def _hard_mask(exon_dict, overlap_dict): """Hard mask all overlaps in the exon_dict. :param dict exon_dict: Dict of the shape exon_id: sequence. :param dict overlap_dict: Dict of the shape (exon1, exon2): overlap between them. """ - exon_dict = FastaDict(exon_dict.copy()) + exon_dict = exon_dict.copy() for (start, end), overlap in overlap_dict.items(): if overlap > 0: exon_dict[start] = _hard_mask_right(exon_dict[start], overlap) @@ -83,7 +79,7 @@ def _hard_mask(exon_dict: FastaDict, overlap_dict: Edge2Overlap) -> FastaDict: return exon_dict -def _mask(exon_dict: FastaDict, overlap_dict: Edge2Overlap, masking: str = "none") -> FastaDict: +def _mask(exon_dict, overlap_dict, masking: str = "none"): """If any of the soft mask or hard mask are activated, mask :param dict exon_dict: Dict of the shape exon_id: sequence. diff --git a/exfi/io/read_bed3.py b/exfi/io/read_bed3.py new file mode 100644 index 0000000..e60ee9b --- /dev/null +++ b/exfi/io/read_bed3.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +"""exfi.io.read_bed3.py: BED3 importer""" + + +def read_bed3(filename): + """Read a BED file and return the BED3 dataframe.""" + import pandas as pd + import numpy as np + bed3 = pd.read_table( + filepath_or_buffer=filename, + header=None, + usecols=[0, 1, 2], + names=["chrom", "chromStart", "chromEnd"], + dtype={"chrom": np.str, "chromStart": np.int, "chromEnd": np.int}, + engine='c' + ) + return bed3 diff --git a/exfi/io/read_gfa1.py b/exfi/io/read_gfa1.py deleted file mode 100644 index ba9eadc..0000000 --- a/exfi/io/read_gfa1.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 - -"""exfi.io.read_gfa1: submodule to process a gfa1 into almost a splice graph""" - -import logging - -from typing import \ - List - -from exfi.classes import \ - FastaDict, \ - Node2Coordinates, \ - Edge2Overlap, \ - Path2Nodes, \ - Coordinate - - - -def _overlap_str_to_int(overlap_str: str) -> int: - """Modify overlap str to int: - - 20G -> -20 - 13M -> M - - :param str overlap_str: overlap string to process. - """ - if not isinstance(overlap_str, str): - raise TypeError("{overlap} is not str".format(overlap=overlap_str)) - letter = overlap_str[-1] - if letter == "M": - return int(overlap_str[:-1]) - elif letter == "G": - return -int(overlap_str[:-1]) - else: - raise ValueError("{letter} letter is not M or G".format(letter=letter)) - - -def _process_segments(segments_raw: List[List[str]]) -> FastaDict: - """Convert a list of segment lines in GFA1 format to dict - - ["S", node_id, sequence, *whatever] -> to a dict {node_id: sequence} - - :param list segments_raw: list of processed segment lines. - """ - logging.info("\tProcessing segments") - segments = FastaDict({ - line[1]: line[2] - for line in segments_raw - }) - return segments - - -def _process_links(links_raw: List[List[str]]) -> Edge2Overlap: - """Convert a list of Link lines in GFA1 format to dict: - - ["L", from, from_orient, to, to_ortient, overlap] to a dict {(from, to): overlap} - - :param list links_raw: list of processed link lines. - """ - logging.info("\tProcessing links") - # _, node_u, _, node_v, _, overlap, *_ = line - # links[(node_u, node_v)] = _overlap_str_to_int(overlap) - links = Edge2Overlap({ - (line[1], line[3]): _overlap_str_to_int(line[5]) - for line in links_raw - }) - return links - - -def _process_containments(containments_raw: List[List[str]]) -> Node2Coordinates: - """Convert a list of containments in GFA1 format to a dict - - ["C", transcript_id, _, node_id, _, position, overlap] to a dict - {node_id: ((transcript_id, start, end), )} - - :param list containments_raw: list of processed containment lines - - """ - logging.info("\tProcessing containments") - containments = Node2Coordinates() - for line in containments_raw: - _, container, _, contained, _, position, overlap = line - overlap_int = _overlap_str_to_int(overlap) - start = int(position) - end = start + overlap_int - if contained not in containments: - containments[contained] = (Coordinate(container, start, end), ) - else: - containments[contained] += (Coordinate(container, start, end), ) - return containments - - -def _process_paths(containments_raw: List[List[str]]) -> Path2Nodes: - """Convert a list of paths in GFA1 format to a dict - - ["P", transcript_id, node1+,...,nodeN+] to a dict {transcript_id: (node1,..., nodeN)} - - :param list containments_raw: list of processed path lines - """ - logging.info("\tProcessing paths") - paths = Path2Nodes() - for line in containments_raw: - _, path_name, segment_names_str = line - # Drop orientations! - segment_names_split = segment_names_str.split(",") - segment_names = tuple( - segment_name[:-1] # Drop the orientations - for segment_name in segment_names_split - ) - paths[path_name] = segment_names - return paths - - - -def _process_tsv_lines(line: str) -> List[str]: - """str -> tsv list""" - line = line.strip() - line_list = line.split("\t") - return line_list - - - -def read_gfa1(filename: str) -> dict: - """Process GFA1 file to an intermediate dict - - Result is a dict { - "header": header list,List[str] - "segments": segment list, - "links": link list, - "cointainments": containments list, - "paths": path list - } - - :param str filename: Path to GFA1 file. - """ - with open(filename, "r") as gfain: - - logging.info("Reading gfa1 %s", filename) - - segments: List[List[str]] = [] - links: List[List[str]] = [] - containments: List[List[str]] = [] - paths: List[List[str]] = [] - - for line_raw in gfain: - - line = _process_tsv_lines(line_raw) - - if line[0] == "H": - header = line - elif line[0] == "S": - segments.append(line) - elif line[0] == "L": - links.append(line) - elif line[0] == "C": - containments.append(line) - elif line[0] == "P": - paths.append(line) - - return { - "header": header[1], - "segments": _process_segments(segments), - "links": _process_links(links), - "containments": _process_containments(containments), - "paths": _process_paths(paths) - } diff --git a/exfi/io/splice_graph_dict_to_gfa1.py b/exfi/io/splice_graph_dict_to_gfa1.py deleted file mode 100644 index 060ed58..0000000 --- a/exfi/io/splice_graph_dict_to_gfa1.py +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/env python3 - -"""exfi.io.splice_graph_to_gfa1.py: functions to convert a splice graph into a GFA1 file""" - -import logging -from typing import Generator - -from itertools import chain - -import networkx as nx -import pandas as pd - -from natsort import \ - natsorted - -from exfi.classes import FastaDict, Node2Coordinates, Edge2Overlap, SpliceGraph, \ - SpliceGraphDict - -from exfi.io import \ - _coordinate_to_str - -from exfi.build_splice_graph_dict import \ - bed6df_to_path2node - - - - -def _get_node2coord(splice_graph: SpliceGraph) -> Node2Coordinates: - """Get node coordinates - - :param nx.DiGraph splice_graph: DiGraph from which to extract node coordinates. - """ - return nx.get_node_attributes(G=splice_graph, name="coordinates") - - -def _get_edge2overlap(splice_graph: SpliceGraph) -> Edge2Overlap: - """Get edge2overlap - - :param nx.DiGraph splice_graph: DiGraph from which to extract edge overlaps. - """ - return nx.get_edge_attributes(G=splice_graph, name="overlaps") - - -def _set_node2coord( - splice_graph: SpliceGraph, node2coord: Node2Coordinates) -> None: - """Set node to coordinates data in splice_graph - - :param nx.DiGraph splice_graph: DiGraph where to store the data. - :param dict node2coord: node coordinates to be stored. - """ - nx.set_node_attributes(G=splice_graph, name="coordinates", values=node2coord) - - -def _set_edge2overlap(splice_graph: SpliceGraph, edge2overlap: Edge2Overlap) -> None: - """Set edge to overlap data in splice_graph - - :param nx.DiGraph splice_graph: DiGraph where to store data - :param dict edge2overlap: edge to overlap data to store - """ - nx.set_edge_attributes(G=splice_graph, name="overlaps", values=edge2overlap) - - -def _compute_segments( - splice_graph_dict: SpliceGraphDict, transcriptome_dict: FastaDict) -> \ - Generator[str, None, None]: - """Compute the segment lines: S node_id sequence length - - :param dict splice_graph_dict: dict of the shape {component_id: nx.DiGraph} - :param dict transcriptome_dict: dict with the transcriptome FASTA - """ - logging.info("\tComputing segments") - for _, splice_graph in natsorted(splice_graph_dict.items()): - node2coords = _get_node2coord(splice_graph) - for node_id, coordinates in natsorted(node2coords.items()): - logging.debug("\t\tProcessing node %s", node_id) - coordinate = coordinates[0] - transcript_id, start, end = coordinate - sequence = str(transcriptome_dict[transcript_id][start:end]) - length = len(sequence) - yield f"S\t{node_id}\t{sequence}\tLN:i:{length}\n" - - -def _compute_links(splice_graph_dict: SpliceGraphDict) -> Generator[str, None, None]: - """Compute the link lines: L start orientation end orientation overlap - - :param dict splice_graph_dict: dict of DiGraphs with the splice graph - """ - logging.info("\tComputing links") - - for _, splice_graph in natsorted(splice_graph_dict.items()): - - # Edges - edge2overlap = _get_edge2overlap(splice_graph) - - for (node1, node2), overlap in natsorted(edge2overlap.items()): - logging.debug("\t\tProcesssing edge (%s, %s)", node1, node2) - # is an overlap or a gap - if overlap >= 0: - overlap = "{}M".format(overlap) - else: - overlap = "{}G".format(-overlap) - yield f"L\t{node1}\t+\t{node2}\t+\t{overlap}\n" - - -def _compute_containments(splice_graph_dict: SpliceGraphDict) -> Generator[str, None, None]: - """Compute the containment lines (w.r.t. transcriptome) - - C container orientation contained orientation position overlap - - :param dict splice_graph_dict: dict of DiGraph representing the splice graph. - """ - # Extract from the graph necessary data - logging.info("\tComputing containments") - - for _, splice_graph in natsorted(splice_graph_dict.items()): - node2coordinates = _get_node2coord(splice_graph) - - for node, coordinates in natsorted(node2coordinates.items()): - for (transcript_id, start, end) in coordinates: - logging.debug( - "\t\tProcessing node %s with coordinates %s:%s-%s", - node, transcript_id, start, end - ) - cigar = str(int(end) - int(start)) + "M" - yield f"C\t{transcript_id}\t+\t{node}\t+\t{start}\t{cigar}\n" - - -def _compute_paths(splice_graph_dict: SpliceGraphDict) -> Generator[str, None, None]: - """Compute the paths in the splice graph: P transcript_id [node1, ..., nodeN] - - :param splice_graph_dict: Dict of splice graphs. - """ - logging.info("\tComputing paths") - - # Transform all the coordinates to a bed6 dataframe - bed6_records = ( - tuple(list(value[0]) + [_coordinate_to_str(value[0]), ".", "+"]) - for splice_graph in splice_graph_dict.values() - for value in _get_node2coord(splice_graph).values() - ) - bed6df = pd.DataFrame( - data=bed6_records, - columns=["chrom", "start", "end", "name", "score", "strand"] - ) - - path2nodes = bed6df_to_path2node(bed6df) - for transcript_id, path in natsorted(path2nodes.items()): - path = ",".join([node + "+" for node in path]) - yield f"P\t{transcript_id}\t{path}\n" - - -def splice_graph_dict_to_gfa1( - splice_graph_dict: SpliceGraphDict, - transcriptome_dict: FastaDict, - filename: str) \ - -> None: - """Write splice graph to filename in GFA 1 format - - :param splice_graph_dict: Dict of Splice Graphs. - :param transcriptome_dict: Dict of Sequences. - :param filename: Ouptut filename. - """ - logging.info("Writing splice graph to GFA1 file %s", filename) - header = ["H\tVN:Z:1.0\n"] - segments = _compute_segments(splice_graph_dict, transcriptome_dict) - links = _compute_links(splice_graph_dict) - containments = _compute_containments(splice_graph_dict) - paths = _compute_paths(splice_graph_dict) - with open(filename, "w") as gfa1_out: - gfa1_out.writelines(chain( - header, segments, links, containments, paths - )) diff --git a/exfi/polish.py b/exfi/polish.py old mode 100644 new mode 100755 index 7aef2ac..5efb07c --- a/exfi/polish.py +++ b/exfi/polish.py @@ -1,175 +1,93 @@ -#!/usr/bin/env python3 +"""exfi.polish.py -"""exfi.polish_overlaps: submodule to polish the overlaps between two exons""" +exfi submodule to polish a bed4 dataframe by checking if in the overlap between +two exons there is the AG-GT splicing signal. +""" -from typing import Iterable - -import networkx as nx - - -import pathos.multiprocessing as mp - -from exfi.io import \ - _coordinate_to_str - -from exfi.classes import Coordinate, SpliceGraph, SpliceGraphDict, FastaDict - - - -def trim_end(coordinate: Coordinate, bases: int) -> Coordinate: - """Trim bases to the end of the coordinate - - :param tuple coordinate: BED3 record - :param int bases: Number of bases to trim form end +def polish_bed4(bed4, transcriptome_dict): """ - return Coordinate(coordinate[0], coordinate[1], coordinate[2] - bases) - - - -def trim_start(coordinate: Coordinate, bases: int) -> Coordinate: - """Trim bases to the start of the coordinate - - :param tuple coordinate: BED3 record. - :param int bases: Number of bases to trim from start. + Trim overlapping exons according to the AG-GT signal. """ - return Coordinate(coordinate[0], coordinate[1] + bases, coordinate[2]) + polished = bed4.copy() + # Get the transcript_id of the next exon + polished["chromNext"] = polished["chrom"].shift(-1) + # Get the name of the next exon + polished["nameNext"] = polished["name"].shift(-1) -def trim_multiple_ends(iterable_coordinate: Iterable[Coordinate], bases: int) \ - -> Iterable[Coordinate]: - """Trim bases at the end of all elements in iterable_coordinate + # Get the start of the next exon + polished["chromStartNext"] = polished["chromStart"].shift(-1) - :param tuple iterable_coordinate: iterable of bed3 records. - :param int bases: number of bases to trim from end. - """ - return tuple(trim_end(coordinate, bases) for coordinate in iterable_coordinate) + # Get the end of the next exon + polished["chromEndNext"] = polished["chromEnd"].shift(-1) + # Remove rows with different transcripts + polished = polished\ + [polished["chrom"] == polished["chromNext"]] + # cast from float to int + polished = polished.astype({"chromStartNext": int, "chromEndNext": int}) -def trim_multiple_starts(iterable_coordinate: Iterable[Coordinate], bases: int) \ - -> Iterable[Coordinate]: - """Trim bases at the start of all elements in iterable_coordinate + # compute the overlap + polished["overlap"] = polished["chromEnd"] - polished["chromStartNext"] - :param tuple iterable_coordinate: iterable of bed3 records. - :param int bases: Number of bases to trim from start. + # Throw away lines that cannot be polished + polished = polished[polished.overlap >= 4] - """ - return tuple(trim_start(coordinate, bases) for coordinate in iterable_coordinate) + # Get the entire transcript sequence + polished["sequence"] = polished.chrom.map(transcriptome_dict) + # Prepare a column with the data required to extract the overlapping seq + polished["data_to_map"] = list(zip( + polished.sequence, + polished.chromStartNext + 1, + polished.chromEnd + 1 + )) + # Get the overlapping sequence + polished["overlap_str"] = polished\ + .data_to_map\ + .map(lambda x: x[0][x[1]:x[2]]) -def polish_splice_graph(splice_graph: SpliceGraph, fasta_dict: FastaDict) -> SpliceGraph: - """Trim overlaps according to the AG/GT signal (AC/CT in the reverse strand) + # Get the position in which the AGGT happens + polished["overlap_index"] = polished["overlap_str"].str.rfind("AGGT") - :param nx.DiGraph splice_graph: SpliceGraph to polish. - :param dict fasta_dict: FastaDict of transcriptome. - """ + # Throw away rows in which AGGT doesn't happen + polished = polished[polished.overlap_index >= 0] - node2coordinates = nx.get_node_attributes(G=splice_graph, name="coordinates") - edge2overlap = nx.get_edge_attributes(G=splice_graph, name="overlaps") - node_mapping = {node: node for node in splice_graph.nodes()} # old: new - - for (node_u, node_v), overlap in edge2overlap.items(): - - if overlap >= 4: - - # Get one of the coordinates (there should be one) - node_u_coord = node2coordinates[node_u][0] - node_v_coord = node2coordinates[node_v][0] - - # Get overlapping thing - overlap_seq = fasta_dict[node_u_coord[0]][node_v_coord[1]:node_u_coord[2]] - - # When there is an overlap, - # Exon structure should be EXON...AG - GT...intron...AG - GT...exon - if "AGGT" in overlap_seq: - - index = overlap_seq.rfind("AGGT") - - # rename both transcripts - ## u: delete overlap untul AG - ## v: delete overlap until GT - new_node_u = _coordinate_to_str(trim_end(node_u_coord, overlap - index - 2)) - new_node_v = _coordinate_to_str(trim_start(node_v_coord, index + 2)) - - # Update old -> new renaming - node_mapping[node_u] = new_node_u - node_mapping[node_v] = new_node_v - - # change coordinate dict values - ## u - node2coordinates[node_u] = trim_multiple_ends( - iterable_coordinate=node2coordinates[node_u], bases=overlap - index - 2 - ) - ## v - node2coordinates[node_v] = trim_multiple_starts( - iterable_coordinate=node2coordinates[node_v], bases=index + 2 - ) - - # change overlap dict values - edge2overlap[(node_u, node_v)] = 0 - - # else: - # merge nodes into one - # Leave as it is? - - # rename nodes in graph - splice_graph = nx.relabel_nodes(G=splice_graph, mapping=node_mapping) - - # assign attributes - ## Nodes - nx.set_node_attributes( - G=splice_graph, - name="coordinates", - values={ - node_mapping[node]: coordinates - for node, coordinates in node2coordinates.items() - } - ) - ## Edges - nx.set_edge_attributes( - G=splice_graph, - name="overlaps", - values={ - (node_mapping[node_u], node_mapping[node_v]): overlap - for (node_u, node_v), overlap in edge2overlap.items() - } - ) - - return splice_graph - - - -def polish_splice_graph_dict( - splice_graph_dict: SpliceGraphDict, fasta_dict: FastaDict, args: dict) -> SpliceGraphDict: - """Polish all overlaps in a splice graph dict - - :param dict splice_graph_dict: SpliceGraphDict to polish. - :param dict fasta_dict: FastaDict of transcriptome. - :param dict args: Dict of arguments for processing. - - args must at least be {"threads": 1} - """ + # Correct positions + polished["chromEndCorrected"] = polished["chromEnd"] - 2 + polished["chromStartNextCorrected"] = \ + polished["chromStartNext"] + polished["overlap_index"] + 2 + + # Organize the elements to correct + ends_to_change = polished\ + [["name", "chromEndCorrected"]]\ + .rename({"chromEndCorrected": "chromEnd"}, axis=1)\ + .set_index("name") - # Initialize pool of workers - pool = mp.Pool(args["threads"], maxtasksperchild=1) + starts_to_change = polished\ + [["nameNext", "chromStartNextCorrected"]]\ + .rename( + {"nameNext": "name", "chromStartNextCorrected": "chromStart"}, + axis=1 + )\ + .set_index("name") - splice_graphs = (splice_graph for splice_graph in splice_graph_dict.values()) - fasta_dicts = ( - {transcript_id: fasta_dict[transcript_id]} for transcript_id in splice_graph_dict.keys() - ) - results = pool.starmap( - polish_splice_graph, - zip(splice_graphs, fasta_dicts), - chunksize=1000 # Number of splice_graphs to process at once - ) - pool.close() - pool.join() + bed4_new = bed4.set_index("name") + # Correct the starts + bed4_new.loc[starts_to_change.index.tolist()].chromStart = \ + starts_to_change.chromStart + # Correct the ends + bed4_new.loc[ends_to_change.index.tolist()].chromEnd = \ + ends_to_change.chromEnd - # Add results to splice_graph_dict - for i, transcript in enumerate(splice_graph_dict.keys()): - splice_graph_dict[transcript] = results[i] + bed4_new = bed4_new.reset_index(drop=False) + bed4_new["name"] = \ + bed4_new.chrom + ":" + \ + bed4_new.chromStart.map(str) + "-" + \ + bed4_new.chromEnd.map(str) - return splice_graph_dict + return bed4_new diff --git a/scripts/pre-commit b/scripts/pre-commit new file mode 100755 index 0000000..2eb2f96 --- /dev/null +++ b/scripts/pre-commit @@ -0,0 +1,37 @@ +#!/bin/bash + +do_nosetests() { + nosetests --verbose --with-coverage --cover-package=exfi --cover-xml +} + +do_sphinx() { + sphinx-apidoc -o docs/source . + sphinx-build docs/source docs/build +} + +fail () { + echo "$@: [FAILED]" + exit 1 +} + +if git rev-parse --verify HEAD >/dev/null 2>&1 +then + against=HEAD +else + # Initial commit: diff against an empty tree object + against=4b825dc642cb6eb9a060e54bf8d69288fbee4904 +fi + +echo "checking pylint of package" +pylint --jobs 0 exfi || fail pylint-exfi + +echo "checking unit tests with nosetests" +do_nosetests nosetests || fail nosetests +echo "checking pylint of tests" +pylint --jobs 0 tests || fail pylint-tests + +#echo "Building Sphinx documentation" +#do_sphinx || fail sphinx + +# If there are whitespace errors, print the offending file names and fail. +exec git diff-index --check --cached $against -- diff --git a/setup.py b/setup.py index 18d5b9a..d75ccf7 100644 --- a/setup.py +++ b/setup.py @@ -20,8 +20,6 @@ 'Biopython', 'networkx', 'pandas', - 'natsort', - 'pathos', ], long_description=open('README.md').read(), test_suite='nose.collector', diff --git a/tests/auxiliary_functions.py b/tests/auxiliary_functions.py index a459e77..2d155d0 100644 --- a/tests/auxiliary_functions.py +++ b/tests/auxiliary_functions.py @@ -4,13 +4,15 @@ Auxiliary functions and classes for testing """ -from typing import Iterable, List, Tuple import tempfile import shutil from subprocess import \ Popen, PIPE +import numpy as np +import pandas as pd + from Bio.SeqIO.FastaIO import SimpleFastaParser from exfi.find_exons import \ @@ -21,32 +23,28 @@ from exfi.build_baited_bloom_filter import \ _get_build_bf_command -from exfi.classes import FastaDict, Coordinate - -def _command_to_list(command: List[str]) -> List[Coordinate]: +def _command_to_list(command): """Execute command and return output as list of strings""" process = Popen(command, stdout=PIPE, shell=False) - results = list(_process_output(process)) + results = _process_output(process) return results -def _fasta_to_list(filename: str) -> List[Tuple[str, str]]: +def _fasta_to_list(filename): """fasta to list with SimpleFastaParser""" with open(filename, "r") as handle: return [record for record in SimpleFastaParser(handle)] -def _getfasta_to_list( - transcriptome_dict: FastaDict, iterable_of_bed: Iterable[Coordinate]) \ - -> List[Tuple[str, str]]: +def _getfasta_to_list(transcriptome_dict, iterable_of_bed): """Convert to a list the generator from getfasta""" return list(_get_fasta(transcriptome_dict, iterable_of_bed)) -def _silent_popen(command: List[str]) -> Popen: +def _silent_popen(command): """Create a Popen with no stderr and stdout""" return Popen( command, @@ -57,7 +55,7 @@ def _silent_popen(command: List[str]) -> Popen: -def _bf_and_process(reads_fns: List[str], transcriptome_fn: str) -> List[Coordinate]: +def _bf_and_process(reads_fns, transcriptome_fn): """(list of str, str) -> list Build the BF and process the reads @@ -80,4 +78,11 @@ def _bf_and_process(reads_fns: List[str], transcriptome_fn: str) -> List[Coordin process.wait() results = _find_exons_pipeline(args) shutil.rmtree(tmp_dir) - return list(results) + bed3 = pd.DataFrame( + data=results, + columns=["chrom", "chromStart", "chromEnd"] + ) + + bed3.chromStart.astype(np.int64) + bed3.chromEnd.astype(np.int64) + return bed3 diff --git a/tests/custom_assertions.py b/tests/custom_assertions.py index beb7e69..acfb374 100644 --- a/tests/custom_assertions.py +++ b/tests/custom_assertions.py @@ -20,8 +20,6 @@ from Bio.SeqRecord import SeqRecord -from exfi.classes import SpliceGraph, SpliceGraphDict - def check_same_keys(dict1: dict, dict2: dict) -> None: """Check if two dicts have the exact same keys""" if set(dict1.keys()) != set(dict2.keys()): @@ -49,7 +47,7 @@ def check_same_dict(dict1: dict, dict2: dict) -> None: -def check_equal_node2coord(sg1: SpliceGraph, sg2: SpliceGraph) -> None: +def check_equal_node2coord(sg1: dict, sg2: dict) -> None: """Check if two splice graphs have the same node2coord dicts""" node2coord1 = nx.get_node_attributes(G=sg1, name="coordinates") node2coord2 = nx.get_node_attributes(G=sg2, name="coordinates") @@ -57,7 +55,7 @@ def check_equal_node2coord(sg1: SpliceGraph, sg2: SpliceGraph) -> None: -def check_equal_edge2overlap(sg1: SpliceGraph, sg2: SpliceGraph) -> None: +def check_equal_edge2overlap(sg1: dict, sg2: dict) -> None: """Check if two splice graphs have the same node2coord dicts""" edge2overlap1 = nx.get_edge_attributes(G=sg1, name="overlaps") edge2overlap2 = nx.get_edge_attributes(G=sg2, name="overlaps") @@ -77,7 +75,7 @@ def check_equal_df_dict_values(dict1: dict, dict2: dict) -> None: -def check_equal_splice_graphs(sg1: SpliceGraph, sg2: SpliceGraph) -> None: +def check_equal_splice_graphs(sg1: dict, sg2: dict) -> None: """Check if two splice graphs are: - isomorphic - node2coord are equal @@ -156,7 +154,7 @@ def assertEqualListOfSeqrecords( @classmethod - def assertEqualSpliceGraphs(self, sg1: SpliceGraph, sg2: SpliceGraph) -> None: + def assertEqualSpliceGraphs(self, sg1: dict, sg2: dict) -> None: """Check if two splice graph are equal:""" # pylint: disable=invalid-name,bad-classmethod-argument check_equal_splice_graphs(sg1, sg2) @@ -173,7 +171,7 @@ def assertEqualDictOfDF( @classmethod - def assertEqualDictOfSpliceGraphs(self, dict1: SpliceGraphDict, dict2: SpliceGraphDict) -> None: + def assertEqualDictOfSpliceGraphs(self, dict1: dict, dict2: dict) -> None: """Check if two dicts of nx.DiGraph and some data attached to nodes and edges are equal""" # pylint: disable=invalid-name, bad-classmethod-argument check_equal_dict_of_sg(dict1, dict2) diff --git a/tests/io/complex.bed b/tests/io/complex.bed new file mode 100644 index 0000000..293ec1a --- /dev/null +++ b/tests/io/complex.bed @@ -0,0 +1,15 @@ +ENSDART00000161035.1 0 326 +ENSDART00000161035.1 397 472 +ENSDART00000161035.1 477 523 +ENSDART00000165342.1 5 127 +ENSDART00000165342.1 125 304 +ENSDART00000165342.1 317 460 +ENSDART00000165342.1 459 592 +ENSDART00000165342.1 591 650 +ENSDART00000165342.1 645 746 +ENSDART00000165342.1 746 851 +ENSDART00000165342.1 854 886 +ENSDART00000165342.1 899 953 +ENSDART00000165342.1 974 1097 +ENSDART00000165342.1 1098 1175 +ENSDART00000165342.1 1176 1324 diff --git a/tests/io/empty.bed b/tests/io/empty.bed new file mode 100644 index 0000000..e69de29 diff --git a/tests/io/simple.bed b/tests/io/simple.bed new file mode 100644 index 0000000..e5f86b0 --- /dev/null +++ b/tests/io/simple.bed @@ -0,0 +1 @@ +ENSDART00000161035.1 0 326 diff --git a/tests/test_collapse.py b/tests/test_collapse.py deleted file mode 100644 index 1383a00..0000000 --- a/tests/test_collapse.py +++ /dev/null @@ -1,217 +0,0 @@ -#!/usr/bin/env python3 - -""" -Tests for exfi.collapse -""" - -from typing import Tuple, Dict - -from unittest import TestCase, main - -import networkx as nx - -from exfi.classes import Coordinate, FastaDict, SpliceGraph, Edge2Overlap - -from exfi.io.fasta_to_dict import \ - fasta_to_dict - -from exfi.collapse import \ - _compute_seq2node, \ - _compute_old2new, \ - _compute_new_node2coord, \ - _compute_new_link2overlap, \ - collapse_splice_graph_dict - -from tests.custom_assertions import \ - CustomAssertions - -from tests.data import \ - NODE2COORDS_EMPTY, NODE2COORDS_SIMPLE, NODE2COORDS_COMPLEX, \ - OVERLAPS_EMPTY, OVERLAPS_SIMPLE, OVERLAPS_COMPLEX, \ - TRANSCRIPTOME_EMPTY_DICT, TRANSCRIPTOME_SIMPLE_DICT, TRANSCRIPTOME_COMPLEX_DICT, \ - SPLICE_GRAPH_EMPTY_DICT, SPLICE_GRAPH_SIMPLE_DICT, SPLICE_GRAPH_COMPLEX_DICT - - -def fasta_to_seq2node(fasta_dict: FastaDict) -> Dict[str, Tuple[str, ...]]: - """Revert fasta_dict: seq: seq_id""" - return {value: (key,) for key, value in fasta_dict.items()} - -SEQ2NODE_EMPTY = fasta_to_seq2node(fasta_to_dict("/dev/null")) -SEQ2NODE_SIMPLE = fasta_to_seq2node(fasta_to_dict("tests/io/exons_simple.fa")) -SEQ2NODE_COMPLEX = fasta_to_seq2node(fasta_to_dict("tests/io/exons_complex.fa")) - -OLD2NEW_EMPTY: Dict[str, str] = {} -OLD2NEW_SIMPLE = { - 'ENSDART00000161035.1:0-326': 'exon_00000000' -} -OLD2NEW_COMPLEX = dict(zip( - fasta_to_dict("tests/io/exons_complex.fa").keys(), - ["exon_{exon_number:08d}".format(exon_number=i) for i in range(len(SEQ2NODE_COMPLEX))] -)) - -NEW_NODE2COORD_EMPTY: Dict[str, Tuple[Coordinate, ...]] = {} -NEW_NODE2COORD_SIMPLE = { - 'exon_00000000': (('ENSDART00000161035.1', 0, 326),) -} -NEW_NODE2COORD_COMPLEX = dict(zip( - OLD2NEW_COMPLEX.values(), - NODE2COORDS_COMPLEX.values() -)) - - -NEW_LINK2OVERLAP_EMPTY = Edge2Overlap() -NEW_LINK2OVERLAP_SIMPLE = Edge2Overlap() -NEW_LINK2OVERLAP_COMPLEX = Edge2Overlap({ - ('exon_00000000', 'exon_00000001'): -71, - ('exon_00000001', 'exon_00000002'): -5, - ('exon_00000003', 'exon_00000004'): 2, - ('exon_00000004', 'exon_00000005'): -13, - ('exon_00000005', 'exon_00000006'): 1, - ('exon_00000006', 'exon_00000007'): 1, - ('exon_00000007', 'exon_00000008'): 5, - ('exon_00000008', 'exon_00000009'): 0, - ('exon_00000009', 'exon_00000010'): -3, - ('exon_00000010', 'exon_00000011'): -13, - ('exon_00000011', 'exon_00000012'): -21, - ('exon_00000012', 'exon_00000013'): -1, - ('exon_00000013', 'exon_00000014'): -1 -}) - - -COLLAPSED_EMPTY = SpliceGraph() - -COLLAPSED_SIMPLE = SpliceGraph() -COLLAPSED_SIMPLE.add_nodes_from(NEW_NODE2COORD_SIMPLE.keys()) -COLLAPSED_SIMPLE.add_edges_from(NEW_LINK2OVERLAP_SIMPLE.keys()) -nx.set_node_attributes(G=COLLAPSED_SIMPLE, name="coordinates", values=NEW_NODE2COORD_SIMPLE) -nx.set_edge_attributes(G=COLLAPSED_SIMPLE, name="overlaps", values=NEW_LINK2OVERLAP_SIMPLE) - -COLLAPSED_COMPLEX = SpliceGraph() -COLLAPSED_COMPLEX.add_nodes_from(NEW_NODE2COORD_COMPLEX.keys()) -COLLAPSED_COMPLEX.add_edges_from(NEW_LINK2OVERLAP_COMPLEX.keys()) -nx.set_node_attributes(G=COLLAPSED_COMPLEX, name="coordinates", values=NEW_NODE2COORD_COMPLEX) -nx.set_edge_attributes(G=COLLAPSED_COMPLEX, name="overlaps", values=NEW_LINK2OVERLAP_COMPLEX) - - - -class TestComputeSeq2Node(TestCase, CustomAssertions): - """Tests for exfi.collapse._compute_seq2node""" - - def test_empty(self): - """exfi.collapse._compute_seq2node: empty case""" - actual = _compute_seq2node(NODE2COORDS_EMPTY, TRANSCRIPTOME_EMPTY_DICT) - expected = SEQ2NODE_EMPTY - self.assertEqualDict(actual, expected) - - def test_simple(self): - """exfi.collapse._compute_seq2node: simple case""" - actual = _compute_seq2node(NODE2COORDS_SIMPLE, TRANSCRIPTOME_SIMPLE_DICT) - expected = SEQ2NODE_SIMPLE - self.assertEqualDict(actual, expected) - - def test_complex(self): - """exfi.collapse._compute_seq2node: complex case""" - actual = _compute_seq2node(NODE2COORDS_COMPLEX, TRANSCRIPTOME_COMPLEX_DICT) - expected = SEQ2NODE_COMPLEX - self.assertEqualDict(actual, expected) - - - -class TestComputeOld2New(TestCase, CustomAssertions): - """Tests for exfi.collapse._compute_old2new""" - - def test_empty(self): - """exfi.collapse._compute_old2: empty case""" - actual = _compute_old2new(SEQ2NODE_EMPTY) - expected = OLD2NEW_EMPTY - self.assertEqualDict(actual, expected) - - def test_simple(self): - """exfi.collapse._compute_old2new: simple case""" - actual = _compute_old2new(SEQ2NODE_SIMPLE) - expected = OLD2NEW_SIMPLE - self.assertEqualDict(actual, expected) - - def test_complex(self): - """exfi.collapse._compute_old2new: complex case""" - actual = _compute_old2new(SEQ2NODE_COMPLEX) - expected = OLD2NEW_COMPLEX - for key, value in actual.items(): - print(key, value) - for key, value in expected.items(): - print(key, value) - self.assertEqualDict(actual, expected) - - - -class TestComputeNewNode2Coord(TestCase, CustomAssertions): - """Tests for exfi.collapse._compute_new_node2coord""" - - def test_empty(self): - """exfi.collapse._compute_new_node2coord: empty case""" - actual = _compute_new_node2coord(OLD2NEW_EMPTY, NODE2COORDS_EMPTY) - expected = NEW_NODE2COORD_EMPTY - self.assertEqualDict(actual, expected) - - def test_simple(self): - """exfi.collapse._compute_new_node2coord: simple case""" - actual = _compute_new_node2coord(OLD2NEW_SIMPLE, NODE2COORDS_SIMPLE) - expected = NEW_NODE2COORD_SIMPLE - self.assertEqualDict(actual, expected) - - def test_complex(self): - """exfi.collapse._compute_new_node2coord: complex case""" - actual = _compute_new_node2coord(OLD2NEW_COMPLEX, NODE2COORDS_COMPLEX) - expected = NEW_NODE2COORD_COMPLEX - self.assertEqualDict(actual, expected) - - - -class TestComputeNewLink2Overlap(TestCase, CustomAssertions): - """Tests for exfi.collapse._compute_new_link2overlap""" - - def test_empty(self): - """exfi.collapse._compute_new_link2overlap: empty case""" - actual = _compute_new_link2overlap(OLD2NEW_EMPTY, OVERLAPS_EMPTY) - expected = NEW_LINK2OVERLAP_EMPTY - self.assertEqualDict(actual, expected) - - def test_simple(self): - """exfi.collapse._compute_new_link2overlap: simple case""" - actual = _compute_new_link2overlap(OLD2NEW_SIMPLE, OVERLAPS_SIMPLE) - expected = NEW_LINK2OVERLAP_SIMPLE - self.assertEqualDict(actual, expected) - - def test_complex(self): - """exfi.collapse._compute_new_link2overlap: complex case""" - actual = _compute_new_link2overlap(OLD2NEW_COMPLEX, OVERLAPS_COMPLEX) - expected = NEW_LINK2OVERLAP_COMPLEX - self.assertEqualDict(actual, expected) - - - -class TestCollapseSpliceGraph(TestCase, CustomAssertions): - """Tests for exfi.collapse_dict.collapse_splice_graph_dict""" - - def test_empty(self): - """exfi.collapse.collapse_splice_graph: empty case""" - actual = collapse_splice_graph_dict(SPLICE_GRAPH_EMPTY_DICT, TRANSCRIPTOME_EMPTY_DICT) - expected = COLLAPSED_EMPTY - self.assertEqualSpliceGraphs(actual, expected) - - def test_simple(self): - """exfi.collapse.collapse_splice_graph: simple case""" - actual = collapse_splice_graph_dict(SPLICE_GRAPH_SIMPLE_DICT, TRANSCRIPTOME_SIMPLE_DICT) - expected = COLLAPSED_SIMPLE - self.assertEqualSpliceGraphs(actual, expected) - - def test_complex(self): - """exfi.collapse.collapse_splice_graph: complex case""" - actual = collapse_splice_graph_dict(SPLICE_GRAPH_COMPLEX_DICT, TRANSCRIPTOME_COMPLEX_DICT) - expected = COLLAPSED_COMPLEX - self.assertEqualSpliceGraphs(actual, expected) - - - -if __name__ == "__main__": - main() diff --git a/tests/test_find_exons.py b/tests/test_find_exons.py index 3b3b943..d2df1ca 100644 --- a/tests/test_find_exons.py +++ b/tests/test_find_exons.py @@ -7,6 +7,9 @@ import unittest +import numpy as np +import pandas as pd + from exfi.io.fasta_to_dict import \ fasta_to_dict @@ -23,6 +26,25 @@ BED3RECORDS_EMPTY, BED3RECORDS_SIMPLE, BED3RECORDS_COMPLEX, \ BED3RECORDS_EMPTY_FN, BED3RECORDS_SIMPLE_FN, BED3RECORDS_COMPLEX_FN +def create_bed_from_lists(lists): + """tests.find_exons_pipeline.create_bed_from_lists: convert list of lists + to a BED3 dataframe""" + bed3 = pd.DataFrame( + data=lists, + columns=["chrom", "chromStart", "chromEnd"] + ) + + bed3.chromStart.astype(np.int64) + bed3.chromEnd.astype(np.int64) + return bed3 + + + +BED3DF_EMPTY = create_bed_from_lists(BED3RECORDS_EMPTY) +BED3DF_SIMPLE = create_bed_from_lists(BED3RECORDS_SIMPLE) +BED3DF_COMPLEX = create_bed_from_lists(BED3RECORDS_COMPLEX) + + class TestProcessOutput(unittest.TestCase): """Tests for _command_to_list""" @@ -30,17 +52,22 @@ class TestProcessOutput(unittest.TestCase): def test_empty_process(self): """exfi.find_exons._command_to_list: process an empty stream""" results = _command_to_list(["cat", BED3RECORDS_EMPTY_FN]) - self.assertEqual(first=results, second=BED3RECORDS_EMPTY) + self.assertTrue(results.shape == (0, 3)) + def test_simple_process(self): """exfi.find_exons._command_to_list: process an simple stream""" results = _command_to_list(["cat", BED3RECORDS_SIMPLE_FN]) - self.assertEqual(first=results, second=BED3RECORDS_SIMPLE) + print("Observed:\n", results) + print("Expected:\n", BED3DF_SIMPLE) + self.assertTrue(results.equals(BED3DF_SIMPLE)) def test_big_process(self): """exfi.find_exons._command_to_list: process an big stream""" results = _command_to_list(["cat", BED3RECORDS_COMPLEX_FN]) - self.assertEqual(first=results, second=BED3RECORDS_COMPLEX) + print("Observed:\n", results, results.dtypes) + print("Expected:\n", BED3DF_COMPLEX, BED3DF_COMPLEX.dtypes) + self.assertTrue(results.equals(BED3DF_COMPLEX)) @@ -48,7 +75,8 @@ class TestGetFastaToList(unittest.TestCase, CustomAssertions): """Tests for _get_fasta_to_list""" def test_empty_sequence_empty_bed(self): - """exfi.find_exons._getfasta_to_list: process an empty fasta and an empty bed""" + """exfi.find_exons._getfasta_to_list: process an empty fasta and an + empty bed""" transcriptome_dict = {} iterable_of_bed = [] self.assertEqual( @@ -57,7 +85,8 @@ def test_empty_sequence_empty_bed(self): ) def test_empty_sequence_one_bed(self): - """exfi.find_exons._getfasta_to_list: process an empty fasta and an empty bed""" + """exfi.find_exons._getfasta_to_list: process an empty fasta and an + empty bed""" transcriptome_dict = {} iterable_of_bed = [("test1", 14, 27)] self.assertEqual( @@ -66,7 +95,8 @@ def test_empty_sequence_one_bed(self): ) def test_one_sequence_empty_bed(self): - """exfi.find_exons._getfasta_to_list: process a simple fasta and an empty bed""" + """exfi.find_exons._getfasta_to_list: process a simple fasta and an + empty bed""" transcriptome_dict = fasta_to_dict( "tests/find_exons/single_sequence.fa" ) @@ -77,7 +107,8 @@ def test_one_sequence_empty_bed(self): ) def test_one_sequence_one_bed(self): - """exfi.find_exons._getfasta_to_list: process an single fasta and a single bed record""" + """exfi.find_exons._getfasta_to_list: process an single fasta and a + single bed record""" transcriptome_dict = fasta_to_dict( "tests/find_exons/one_sequence_one_bed_input.fa" ) @@ -109,28 +140,33 @@ class TestFindExonsPipeline(unittest.TestCase): """Tests for find_exons_pipeline""" def test_notranscriptome_noreads(self): - """exfi.find_exons._bf_and_process: Process an empty transcriptome and an empty BF""" + """exfi.find_exons._bf_and_process: Process an empty transcriptome and + an empty BF""" reads_fns = ["/dev/null"] transcriptome_fn = "/dev/null" results = _bf_and_process(reads_fns, transcriptome_fn) - self.assertEqual(results, []) + print("Observed:\n", results) + print("Expected:\n", BED3DF_EMPTY) + self.assertEqual(results.shape, (0, 3)) def test_transcriptome_noreads(self): - """exfi.find_exons._bf_and_process: Process a small transcriptome and an empty BF""" + """exfi.find_exons._bf_and_process: Process a small transcriptome and + an empty BF""" reads_fns = ["/dev/null"] transcriptome_fn = 'tests/find_exons/small_transcriptome.fa' results = _bf_and_process(reads_fns, transcriptome_fn) - self.assertEqual(results, []) + self.assertEqual(results.shape, (0, 3)) def test_small_data(self): - """exfi.find_exons._bf_and_process: Process an empty transcriptome and a small BF""" + """exfi.find_exons._bf_and_process: Process an empty transcriptome and + a small BF""" reads_fns = [ 'tests/find_exons/reads_1.fq', 'tests/find_exons/reads_2.fq' ] transcriptome_fn = 'tests/find_exons/small_transcriptome.fa' results = _bf_and_process(reads_fns, transcriptome_fn) - self.assertEqual(results, []) + self.assertEqual(results.shape, (0, 3)) if __name__ == "__main__": diff --git a/tests/test_io/test_fasta_to_dict.py b/tests/test_io/test_fasta_to_dict.py index fdd4c26..e6ed705 100644 --- a/tests/test_io/test_fasta_to_dict.py +++ b/tests/test_io/test_fasta_to_dict.py @@ -9,7 +9,6 @@ from exfi.io.fasta_to_dict import \ fasta_to_dict -from exfi.classes import FastaDict from tests.custom_assertions import \ CustomAssertions @@ -20,15 +19,15 @@ EXONS_SPACED_FN = "tests/io/spaced.fa" EXONS_TABBED_FN = "tests/io/tabbed.fa" -EXONS_EMPTY_DICT = FastaDict() -EXONS_SIMPLE_DICT = FastaDict({ +EXONS_EMPTY_DICT = {} +EXONS_SIMPLE_DICT = { 'ENSDART00000161035.1:0-326': 'TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAA' 'GTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCT' 'CTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGAC' 'ACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA' -}) -EXONS_COMPLEX_DICT = FastaDict({ +} +EXONS_COMPLEX_DICT = { 'ENSDART00000161035.1:0-326': 'TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAA' 'GTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCT' @@ -70,7 +69,7 @@ 'ENSDART00000165342.1:1176-1324': 'CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAG' 'ATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG' -}) +} class TestFastaToDict(TestCase, CustomAssertions): From 52108aa78395b8f734ca20e13bad5cd26386bd3d Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Mon, 14 Jan 2019 17:25:34 +0100 Subject: [PATCH 19/45] Tests for exfi.correct. Renaming of functions in exfi.find_exons --- bin/build_splice_graph | 4 +- exfi/correct.py | 19 +- exfi/find_exons.py | 10 +- tests/auxiliary_functions.py | 12 +- tests/test_correct.py | 326 ++++++++--------------------------- 5 files changed, 92 insertions(+), 279 deletions(-) diff --git a/bin/build_splice_graph b/bin/build_splice_graph index abd8398..81b7b44 100755 --- a/bin/build_splice_graph +++ b/bin/build_splice_graph @@ -12,7 +12,7 @@ from Bio import SeqIO from exfi import __version__ -from exfi.find_exons import _find_exons_pipeline +from exfi.find_exons import find_exons from exfi.io.bed import bed3_to_bed4 from exfi.io.fasta_to_dict import fasta_to_dict from exfi.polish import polish_bed4 @@ -175,7 +175,7 @@ if __name__ == "__main__": logger.setLevel(logging.DEBUG) # Get predicted exons in bed format - bed3 = _find_exons_pipeline(args) + bed3 = find_exons(args) bed4 = bed3_to_bed4(bed3) # Transcriptome_dict diff --git a/exfi/correct.py b/exfi/correct.py index e436665..a20e3da 100755 --- a/exfi/correct.py +++ b/exfi/correct.py @@ -72,7 +72,7 @@ def prepare_sealer(bed4, transcriptome_dict, args): return sealer_input[1] -def _run_sealer(sealer_input_fn: str, args: dict) -> str: +def run_sealer(sealer_input_fn: str, args: dict) -> str: """Run abyss-sealer with the parameters in args, and the scaffold in sealer_input. @@ -118,10 +118,9 @@ def collect_sealer_results(filename): """Read the fasta output from sealer and return the merged nodes""" if os.path.getsize(filename) == 0: - return pd.DataFrame(data=None, columns=["raw"]) + return pd.DataFrame(data=None, columns=["u", "v"]) headers = pd.read_csv(filename, header=None, sep="\t") - print(headers) headers = headers.iloc[::2] # Take odd rows: headers. headers.columns = ["raw"] @@ -132,6 +131,7 @@ def collect_sealer_results(filename): .str.split("~") headers["u"], headers["v"] = headers.clean.str headers = headers[["u", "v"]] + headers = headers.reset_index(drop=True) return headers @@ -141,19 +141,20 @@ def apply_correction_to_bed4(bed4, sealed_edges): return bed4 new_bed4 = bed4.copy().set_index("name") for row in sealed_edges.iloc[::-1].itertuples(): - new_bed4.loc[row.u, "chromEnd"] = new_bed4.loc[row.v, "chromStart"] - new_bed4 = new_bed4.drop(sealed_edges["v"].values) + new_bed4.loc[row.u, "chromEnd"] = new_bed4.loc[row.v, "chromEnd"] + new_bed4 = new_bed4.drop(sealed_edges.v) new_bed4 = bed3_to_bed4(new_bed4[["chrom", "chromStart", "chromEnd"]]) - return new_bed4 - + return new_bed4.reset_index(drop=True) def correct_bed4(bed4, transcriptome_dict, args): """Inspect the bed4 for small gaps and overlaps, write a fasta file for sealer, and correct the bed4. """ - sealer_input_fn = prepare_sealer(bed4=bed4, transcriptome_dict=transcriptome_dict, args=args) - output_sealer_fn = _run_sealer(sealer_input_fn=sealer_input_fn, args=args) + sealer_input_fn = prepare_sealer( + bed4=bed4, transcriptome_dict=transcriptome_dict, args=args + ) + output_sealer_fn = run_sealer(sealer_input_fn=sealer_input_fn, args=args) sealer_results = collect_sealer_results(filename=output_sealer_fn) bed4_corrected = apply_correction_to_bed4(bed4, sealer_results) return bed4_corrected diff --git a/exfi/find_exons.py b/exfi/find_exons.py index 578bd20..f73ca3d 100644 --- a/exfi/find_exons.py +++ b/exfi/find_exons.py @@ -16,7 +16,7 @@ import pandas as pd import numpy as np -def _process_output(process): +def process_output(process): """Get lines in bed format from the output of a Popen. :param Popen process: Popen object. @@ -39,7 +39,7 @@ def _process_output(process): return bed3 -def _get_fasta(transcriptome_dict, iterable_of_bed): +def get_fasta(transcriptome_dict, iterable_of_bed): """Extract subsequences in trancriptome_fn according to locis. :param dict transcriptome_dict: FastaDict of the transcriptome @@ -53,8 +53,8 @@ def _get_fasta(transcriptome_dict, iterable_of_bed): yield (identifier, seq) -def _find_exons_pipeline(args): - """Find exons according to the Bloom filter -> BED +def find_exons(args): + """Find exons according to the Bloom filter -> BED3 Main pipeline: - Check every kmer, @@ -91,4 +91,4 @@ def _find_exons_pipeline(args): p_filter = Popen(c_filter, stdin=p_merge1.stdout, stdout=PIPE) p_merge2 = Popen(c_merge2, stdin=p_filter.stdout, stdout=PIPE) p_kmers.stdout.close() - return _process_output(p_merge2) + return process_output(p_merge2) diff --git a/tests/auxiliary_functions.py b/tests/auxiliary_functions.py index 2d155d0..7882faa 100644 --- a/tests/auxiliary_functions.py +++ b/tests/auxiliary_functions.py @@ -16,9 +16,9 @@ from Bio.SeqIO.FastaIO import SimpleFastaParser from exfi.find_exons import \ - _process_output, \ - _get_fasta, \ - _find_exons_pipeline + process_output, \ + get_fasta, \ + find_exons from exfi.build_baited_bloom_filter import \ _get_build_bf_command @@ -26,7 +26,7 @@ def _command_to_list(command): """Execute command and return output as list of strings""" process = Popen(command, stdout=PIPE, shell=False) - results = _process_output(process) + results = process_output(process) return results @@ -40,7 +40,7 @@ def _fasta_to_list(filename): def _getfasta_to_list(transcriptome_dict, iterable_of_bed): """Convert to a list the generator from getfasta""" - return list(_get_fasta(transcriptome_dict, iterable_of_bed)) + return list(get_fasta(transcriptome_dict, iterable_of_bed)) @@ -76,7 +76,7 @@ def _bf_and_process(reads_fns, transcriptome_fn): command = _get_build_bf_command(args, reads_fns) process = _silent_popen(command) process.wait() - results = _find_exons_pipeline(args) + results = find_exons(args) shutil.rmtree(tmp_dir) bed3 = pd.DataFrame( data=results, diff --git a/tests/test_correct.py b/tests/test_correct.py index 838eb7f..cc80ab8 100644 --- a/tests/test_correct.py +++ b/tests/test_correct.py @@ -14,33 +14,28 @@ from os import remove from os.path import dirname -import networkx as nx +import pandas as pd -from exfi.classes import SpliceGraph +import networkx as nx from exfi.build_baited_bloom_filter import \ build_baited_bloom_filter from exfi.find_exons import \ - _find_exons_pipeline - -from exfi.build_splice_graph_dict import \ - build_splice_graph_dict + find_exons from exfi.io.fasta_to_dict import \ fasta_to_dict +from exfi.io.bed import \ + bed3_to_bed4 + from exfi.correct import \ - _prepare_sealer, \ - _run_sealer, \ - _collect_sealer_results, \ - _filled_edges_by_transcript, \ - _rename_nodes_from_collapse, \ - _recompute_node2coord, \ - _recompute_edge2overlap, \ - _compute_new_node_ids, \ - _sculpt_graph, \ - correct_splice_graph_dict + prepare_sealer, \ + run_sealer, \ + collect_sealer_results, \ + apply_correction_to_bed4, \ + correct_bed4 from tests.custom_assertions import \ CustomAssertions @@ -69,93 +64,28 @@ def _compose_args(bloom_fn: str, gfa_fn: str) -> dict: TEMP_GFA = TEMP[1] + ".gfa" ARGS = _compose_args(TEMP_BLOOM, TEMP_GFA) build_baited_bloom_filter(ARGS) -POSITIVE_EXONS_BED = list(_find_exons_pipeline(ARGS)) -SPLICE_GRAPH_DICT = build_splice_graph_dict(POSITIVE_EXONS_BED, ARGS) -SPLICE_GRAPH = SPLICE_GRAPH_DICT["ENSDART00000149335.2"] -EDGE2FILL = { - ('ENSDART00000149335.2:485-1715', 'ENSDART00000149335.2:1717-2286'), - ('ENSDART00000149335.2:1717-2286', 'ENSDART00000149335.2:2288-3379') -} -FILLED_EDGE_BY_TRANSCRIPT = { - 'ENSDART00000149335.2': EDGE2FILL -} - - -def partition(node_u: str, node_v: str, edge2fill: set): - """Define partitions as how the graph should be filled""" - graph = SpliceGraph() - graph.add_edges_from(edge2fill) - if node_u in graph.nodes() and \ - node_v in graph.nodes() and \ - nx.has_path(G=graph, source=node_u, target=node_v): - return True - return False - -FULL_PARTITION = lambda u, v: partition(u, v, EDGE2FILL) -COLLAPSED_GRAPH = nx.quotient_graph(SPLICE_GRAPH, partition=FULL_PARTITION) - -QUOTIENT_RELABELING = { - frozenset({ - 'ENSDART00000149335.2:0-486' - }): - 'ENSDART00000149335.2:0-486', - frozenset({ - 'ENSDART00000149335.2:485-1715', - 'ENSDART00000149335.2:1717-2286', - 'ENSDART00000149335.2:2288-3379' - }): ( - 'ENSDART00000149335.2:485-1715', - 'ENSDART00000149335.2:1717-2286', - 'ENSDART00000149335.2:2288-3379' - ) -} - -QUOTIENT_RELABELED = nx.relabel_nodes( - copy=True, - G=COLLAPSED_GRAPH, - mapping=QUOTIENT_RELABELING +BED3 = find_exons(ARGS) +BED4 = bed3_to_bed4(BED3) +TRANSCRIPTOME_DICT = fasta_to_dict(ARGS["fasta"]) + + + +SEALED_EDGES = pd.DataFrame( + data=[ + ["ENSDART00000149335.2:485-1715", "ENSDART00000149335.2:1717-2286"], + ["ENSDART00000149335.2:1717-2286", "ENSDART00000149335.2:2288-3379"] + ], + columns=["u", "v"] ) -NEW_NODE2COORD = { - 'ENSDART00000149335.2:0-486': - (('ENSDART00000149335.2', 0, 486),), - ('ENSDART00000149335.2:485-1715', 'ENSDART00000149335.2:1717-2286', - 'ENSDART00000149335.2:2288-3379'): - (('ENSDART00000149335.2', 485, 3379),) -} - -NEW_EDGE2OVERLAP = { - ('ENSDART00000149335.2:485-1715', - 'ENSDART00000149335.2:1717-2286', - 'ENSDART00000149335.2:2288-3379'): - (('ENSDART00000149335.2', 485, 3379),), - 'ENSDART00000149335.2:0-486': - (('ENSDART00000149335.2', 0, 486),) -} - -NEW_NODE_IDS = { - ('ENSDART00000149335.2:485-1715', 'ENSDART00000149335.2:1717-2286', - 'ENSDART00000149335.2:2288-3379'): - 'ENSDART00000149335.2:485-3379', - 'ENSDART00000149335.2:0-486': - 'ENSDART00000149335.2:0-486' -} - -SEALED_GRAPH = nx.DiGraph() -SEALED_GRAPH.add_nodes_from(["ENSDART00000149335.2:0-486", "ENSDART00000149335.2:485-3379"]) -nx.set_node_attributes(SEALED_GRAPH, name="coordinates", values={ - "ENSDART00000149335.2:0-486": (("ENSDART00000149335.2", 0, 486),), - "ENSDART00000149335.2:485-3379": (("ENSDART00000149335.2", 485, 3379),) -}) -SEALED_GRAPH.add_edge( - u="ENSDART00000149335.2:0-486", - v="ENSDART00000149335.2:485-3379" +BED4_CORRECTED = pd.DataFrame( + data=[ + ["ENSDART00000149335.2", 0, 486, "ENSDART00000149335.2:0-486"], + ["ENSDART00000149335.2", 485, 3379, "ENSDART00000149335.2:485-3379"] + ], + columns=["chrom", "chromStart", "chromEnd", "name"] ) -nx.set_edge_attributes(SEALED_GRAPH, name="overlaps", values={ - ("ENSDART00000149335.2:0-486", "ENSDART00000149335.2:485-3379"): 1 -}) -SEALED_GRAPH_DICT = {"ENSDART00000149335.2": SEALED_GRAPH} def tearDownModule(): """Remove temporary bloom and temporary GFA files""" @@ -172,7 +102,7 @@ class TestPrepareSealer(TestCase, CustomAssertions): def test_file_creation(self): """exfi.correct._prepare_sealer: test creation""" - sealer_input_fn = _prepare_sealer(SPLICE_GRAPH_DICT, ARGS) + sealer_input_fn = prepare_sealer(BED4, TRANSCRIPTOME_DICT, ARGS) actual = fasta_to_dict(sealer_input_fn) expected = fasta_to_dict("tests/correct/to_seal.fa") remove(sealer_input_fn) @@ -181,14 +111,16 @@ def test_file_creation(self): class TestRunSealer(TestCase, CustomAssertions): - """_run_sealer(sealer_input_fn, args): + """run_sealer(sealer_input_fn, args): (str, dict) -> str """ def test_run(self): - """exfi.correct._run_sealer: test if runs""" - sealer_in_fn = _prepare_sealer(SPLICE_GRAPH_DICT, ARGS) - sealer_out_fn = _run_sealer(sealer_input_fn=sealer_in_fn, args=ARGS) + """exfi.correct.run_sealer: test if runs""" + sealer_in_fn = prepare_sealer( + bed4=BED4, transcriptome_dict=TRANSCRIPTOME_DICT, args=ARGS + ) + sealer_out_fn = run_sealer(sealer_input_fn=sealer_in_fn, args=ARGS) actual = fasta_to_dict("tests/correct/sealed.fa") expected = fasta_to_dict(sealer_out_fn) remove(sealer_in_fn) @@ -198,180 +130,60 @@ def test_run(self): class TestCollectSealerResults(TestCase): - """_collect_sealer_results(handle): + """collect_sealer_results(handle): (str) -> dict """ def test_collect_empty(self): - """exfi.correct._collect_sealer_results: empty case""" + """exfi.correct.collect_sealer_results: empty case""" empty_file = mkstemp() - sealer_output_fn = _run_sealer(sealer_input_fn=empty_file[1], args=ARGS) - edge2fill = _collect_sealer_results(handle=sealer_output_fn) + sealer_output_fn = run_sealer(sealer_input_fn=empty_file[1], args=ARGS) + observed = collect_sealer_results(filename=sealer_output_fn) remove(empty_file[1]) remove(sealer_output_fn) - self.assertEqual(edge2fill, set()) + print("shape = {}\n".format(observed.shape)) + self.assertTrue(observed.shape == (0, 2)) def test_collect_somedata(self): - """exfi.correct._collect_sealer_results: some data""" - edge2fill = _collect_sealer_results( - handle="tests/correct/sealed.fa" - ) - self.assertEqual(edge2fill, EDGE2FILL) - - - -class TestFilledEdgeByTranscript(TestCase, CustomAssertions): - """Tests for _filled_edges_by_transcript(splice_graph: nx.DiGraph, filled_edges: str) -> dict""" - - def test_empty(self): - """exfi.correct._filled_edge_by_transcript: empty case""" - initial = {} - actual = _filled_edges_by_transcript(filled_edges=initial) - expected = {} - self.assertEqualDict(actual, expected) - - def test_some_data(self): - """exfi.correct._filled_edge_by_transcript: some data""" - initial = _collect_sealer_results( - handle="tests/correct/sealed.fa" - ) - actual = _filled_edges_by_transcript(filled_edges=initial) - expected = FILLED_EDGE_BY_TRANSCRIPT - self.assertEqualDict(actual, expected) - - - -class TestRenameNodesFromCollapse(TestCase, CustomAssertions): - """Tests for _rename_nodes_from_collapse - - _rename_nodes_from_collapse(quotient_graph: nx.DiGraph) -> dict - """ - - def test_empty(self): - """exfi.correct._rename_nodes_from_collapse: empty case""" - initial = nx.DiGraph() - actual = _rename_nodes_from_collapse(initial) - expected = {} - self.assertEqualDict(actual, expected) - - def test_some_data(self): - """exfi.correct._rename_nodes_from_collapse: some data""" - actual = _rename_nodes_from_collapse(COLLAPSED_GRAPH) - expected = QUOTIENT_RELABELING - self.assertEqualDict(actual, expected) - - - - -class TestRecomputeNode2Coord(TestCase, CustomAssertions): - """Tests for exfi.correct._recompute_node2coord - - _recompute_node2coord(component: nx.DiGraph, quotient_relabeled: nx.DiGraph) -> dict - """ - - def test_empty(self): - """exfi.correct._recompute_node2coord: empty case""" - actual = _recompute_node2coord(nx.DiGraph(), nx.DiGraph()) - expected = {} - self.assertEqualDict(actual, expected) - - def test_some_data(self): - """exfi.correct._recompute_node2coord: some data""" - actual = _recompute_node2coord( - component=SPLICE_GRAPH, - quotient_relabeled=QUOTIENT_RELABELED - ) - expected = NEW_NODE2COORD - self.assertEqualDict(actual, expected) - - - -class TestRecomputeEdge2Overlap(TestCase, CustomAssertions): - """Tests for _recompute_edge2overlap - - _recompute_edge2overlap(component: nx.DiGraph, quotient_relabeled: nx.DiGraph) -> dict - """ - - def test_empty(self): - """exfi.correct._recompute_edge2overlap: empty case""" - actual = _recompute_edge2overlap(nx.DiGraph(), nx.DiGraph()) - expected = {} - self.assertEqualDict(actual, expected) - - def test_some_data(self): - """exfi.correct._recompute_edge2overlap: some data""" - actual = _recompute_node2coord( - component=SPLICE_GRAPH, - quotient_relabeled=QUOTIENT_RELABELED + """exfi.correct.collect_sealer_results: some data""" + observed = collect_sealer_results( + filename="tests/correct/sealed.fa" ) - expected = NEW_EDGE2OVERLAP - self.assertEqualDict(actual, expected) + print("observed:\n", observed) + print("expected:\n", SEALED_EDGES) + self.assertTrue(observed.equals(SEALED_EDGES)) -class TestComputeNewNodeIds(TestCase): - """Test for exfi.correct._compute_new_node_ids +class TestApplySealerCorrection(TestCase): + """apply_correction_to_bed4(bed4, sealed_edges) -> bed4_corrected""" - _compute_new_node_ids(quotient_relabeled: nx.DiGraph, component: nx.DiGraph) -> dict - """ - def test_empty(self): - """exfi.correct._compute_new_node_ids: empty case""" - actual = _compute_new_node_ids(nx.DiGraph(), nx.DiGraph()) - expected = {} - self.assertEqual(actual, expected) + def test_empty_sealed(self): + """exfi.correct.apply_correction_to_bed4: no sealing""" + no_sealing = pd.DataFrame(columns=["u", "v"]) + observed = apply_correction_to_bed4(BED4, no_sealing) + self.assertTrue(BED4.equals(observed)) def test_some_data(self): - """exfi.correct._compute_new_node_ids: some data""" - actual = _compute_new_node_ids(QUOTIENT_RELABELED, SPLICE_GRAPH) - expected = NEW_NODE_IDS - self.assertEqual(actual, expected) + """exfi.correct.apply_correction_to_bed4: no sealing""" + observed = apply_correction_to_bed4(BED4, SEALED_EDGES) + print("BED4:\n", BED4) + print("Observed:\n", observed) + print("Expected:\n", BED4_CORRECTED) + self.assertTrue(observed.equals(BED4_CORRECTED)) +class TestCorrectBED4(TestCase): + """correct_bed4(bed4, transcriptome_dict, args) -> bed4_corrected""" -class TestSculptGraph(TestCase): - """Tests for exfi.correct._sculpt_graph - - _sculpt_graph(splice_graph: nx.DiGraph, filled_edges: set) -> nx.DiGraph - """ - - def test_sculpt_empty_data(self): - """exfi.correct._sculpt_graph: empty case""" - sealed_graph = _sculpt_graph(SPLICE_GRAPH, {}) - self.assertTrue(nx.is_isomorphic( - sealed_graph, - SPLICE_GRAPH - )) - - - def test_scuplt_real_data(self): - """exfi.correct._sculpt_graph: some data""" - test_graph = nx.DiGraph() - test_graph.add_edge( - u="ENSDART00000149335.2:0-486", - v="ENSDART00000149335.2:485-3379" - ) - edge2fill = _collect_sealer_results( - handle="tests/correct/sealed.fa" + def test_simple(self): + """exfi.correct.correct_bed4: some data""" + observed = correct_bed4( + bed4=BED4, transcriptome_dict=TRANSCRIPTOME_DICT, args=ARGS ) - sealed_graph = _sculpt_graph( - splice_graph=SPLICE_GRAPH, filled_edges=edge2fill - ) - self.assertTrue(nx.is_isomorphic( - sealed_graph, - test_graph - )) - - - -class TestCorrectSpliceGraphDict(TestCase, CustomAssertions): - """Tests for exfi.correct_dict.correct_splice_graph_dict""" - - def test_correct_splice_graph_dict(self): - """exfi.correct.correct_splice_graph: some data""" + expected = BED4_CORRECTED + self.assertTrue(observed.equals(expected)) - splice_graph_dict = build_splice_graph_dict(POSITIVE_EXONS_BED, ARGS) - sealed_graph_dict = correct_splice_graph_dict(splice_graph_dict, ARGS) - self.assertEqualDictOfSpliceGraphs(sealed_graph_dict, SEALED_GRAPH_DICT) From 6069f1dc8b43236d35d2a38b96f65b0693b046cf Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Mon, 14 Jan 2019 17:59:14 +0100 Subject: [PATCH 20/45] Tests for exfi.polish --- exfi/polish.py | 2 +- tests/test_correct.py | 2 - tests/test_polish.py | 204 ++++++++++++++---------------------------- 3 files changed, 67 insertions(+), 141 deletions(-) diff --git a/exfi/polish.py b/exfi/polish.py index 5efb07c..cb8b317 100755 --- a/exfi/polish.py +++ b/exfi/polish.py @@ -90,4 +90,4 @@ def polish_bed4(bed4, transcriptome_dict): bed4_new.chromStart.map(str) + "-" + \ bed4_new.chromEnd.map(str) - return bed4_new + return bed4_new[["chrom", "chromStart", "chromEnd", "name"]] diff --git a/tests/test_correct.py b/tests/test_correct.py index cc80ab8..16b1ae5 100644 --- a/tests/test_correct.py +++ b/tests/test_correct.py @@ -16,8 +16,6 @@ import pandas as pd -import networkx as nx - from exfi.build_baited_bloom_filter import \ build_baited_bloom_filter diff --git a/tests/test_polish.py b/tests/test_polish.py index 5c6b195..a2d795f 100644 --- a/tests/test_polish.py +++ b/tests/test_polish.py @@ -6,169 +6,97 @@ TestCase, \ main -import networkx as nx +import pandas as pd from exfi.polish import \ - trim_start, \ - trim_end, \ - polish_splice_graph, \ - polish_splice_graph_dict - -from exfi.classes import Coordinate, SpliceGraph, SpliceGraphDict - -from tests.custom_assertions import \ - CustomAssertions + polish_bed4 from tests.data import \ - SPLICE_GRAPH_EMPTY, SPLICE_GRAPH_SIMPLE, SPLICE_GRAPH_COMPLEX, \ - SPLICE_GRAPH_EMPTY_DICT, SPLICE_GRAPH_SIMPLE_DICT, SPLICE_GRAPH_COMPLEX_DICT, \ - TRANSCRIPTOME_EMPTY_DICT, TRANSCRIPTOME_SIMPLE_DICT, TRANSCRIPTOME_COMPLEX_DICT, \ - NODE2COORDS_COMPLEX_PART1, NODE2COORDS_COMPLEX_PART2, \ - OVERLAPS_COMPLEX, \ - SPLICE_GRAPH_1, SPLICE_GRAPH_2 - + TRANSCRIPTOME_EMPTY_DICT, \ + TRANSCRIPTOME_SIMPLE_DICT, \ + TRANSCRIPTOME_COMPLEX_DICT # Test data -POLISHED_EMPTY = SpliceGraph() -POLISHED_SIMPLE = SpliceGraph() -POLISHED_SIMPLE.add_node("ENSDART00000161035.1:0-326") -nx.set_node_attributes( - G=POLISHED_SIMPLE, - name="coordinates", - values={'ENSDART00000161035.1:0-326': (Coordinate('ENSDART00000161035.1', 0, 326),)} +BED4_EMPTY = pd.DataFrame( + data=None, columns=["chrom", "chromStart", "chromEnd", "name"] ) - -POLISHED_COMPLEX = SpliceGraph() -POLISHED_COMPLEX.add_nodes_from( - tuple(NODE2COORDS_COMPLEX_PART1.keys()) + tuple(NODE2COORDS_COMPLEX_PART2.keys()) +BED4_SIMPLE = pd.DataFrame( + data=[("ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326")], + columns=["chrom", "chromStart", "chromEnd", "name"] ) -nx.set_node_attributes( - G=POLISHED_COMPLEX, - name="coordinates", - values={**NODE2COORDS_COMPLEX_PART1, **NODE2COORDS_COMPLEX_PART2} -) -POLISHED_COMPLEX.add_edges_from(OVERLAPS_COMPLEX.keys()) -nx.set_edge_attributes( - G=POLISHED_COMPLEX, - name="overlaps", - values=OVERLAPS_COMPLEX -) - -POLISHED_EMPTY_DICT = SpliceGraphDict() -POLISHED_SIMPLE_DICT = SpliceGraphDict({"ENSDART00000161035.1": POLISHED_SIMPLE}) -POLISHED_COMPLEX_DICT = SpliceGraphDict({ - 'ENSDART00000161035.1': SPLICE_GRAPH_1, - 'ENSDART00000165342.1': SPLICE_GRAPH_2 -}) - -ARGS = { - "threads": 1 -} - - - -class TestCoordAddLeft(TestCase): - """Tests for exfi.polish.trim_start""" - - def test_empty(self): - """exfi.polish.trim_start: empty case""" - with self.assertRaises(IndexError): - trim_start([], 1) - - def test_short(self): - """exfi.polish.trim_start: short case""" - with self.assertRaises(IndexError): - trim_start([1], 1) - - def test_correct(self): - """exfi.polish.trim_start: correct case""" - actual = trim_start(("tr", 1, 2), 2) - expected = Coordinate("tr", 3, 2) - self.assertEqual(actual, expected) - def test_too_long(self): - """exfi.polish.trim_start: too long case""" - actual = trim_start(("tr", 1, 2, 5, 5), 2) - expected = Coordinate("tr", 3, 2) - self.assertEqual(actual, expected) - - - -class TestCoordAddRight(TestCase): - """Tests for exfi.polish.trim_end""" - - def test_empty(self): - """exfi.polish.trim_end: empty case""" - with self.assertRaises(IndexError): - trim_end([], 1) - - def test_short(self): - """exfi.polish.trim_end: short case""" - with self.assertRaises(IndexError): - trim_end([1], 1) - - def test_correct(self): - """exfi.polish.trim_end: correct case""" - actual = trim_end(("tr", 1, 2), 2) - expected = Coordinate("tr", 1, 0) - self.assertEqual(actual, expected) - - def test_too_long(self): - """exfi.polish.trim_end: too long case""" - actual = trim_end(("tr", 1, 2, 5, 5), 2) - expected = Coordinate("tr", 1, 0) - self.assertEqual(actual, expected) +BED4_COMPLEX = pd.DataFrame( + data=[ + ["ENSDART00000161035.1", 397, 472, "ENSDART00000161035.1:397-472"], + ["ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326"], + ["ENSDART00000161035.1", 477, 523, "ENSDART00000161035.1:477-523"], + ["ENSDART00000165342.1", 1176, 1324, "ENSDART00000165342.1:1176-1324"], + ["ENSDART00000165342.1", 125, 304, "ENSDART00000165342.1:125-304"], + ["ENSDART00000165342.1", 746, 851, "ENSDART00000165342.1:746-851"], + ["ENSDART00000165342.1", 974, 1097, "ENSDART00000165342.1:974-1097"], + ["ENSDART00000165342.1", 854, 886, "ENSDART00000165342.1:854-886"], + ["ENSDART00000165342.1", 1098, 1175, "ENSDART00000165342.1:1098-1175"], + ["ENSDART00000165342.1", 5, 127, "ENSDART00000165342.1:5-127"], + ["ENSDART00000165342.1", 645, 746, "ENSDART00000165342.1:645-746"], + ["ENSDART00000165342.1", 317, 460, "ENSDART00000165342.1:317-460"], + ["ENSDART00000165342.1", 591, 650, "ENSDART00000165342.1:591-650"], + ["ENSDART00000165342.1", 459, 592, "ENSDART00000165342.1:459-592"], + ["ENSDART00000165342.1", 899, 953, "ENSDART00000165342.1:899-953"] + ], + columns=["chrom", "chromStart", "chromEnd", "name"] +) +BED4_SIMPLE_POLISHED = BED4_SIMPLE +BED4_COMPLEX_POLISHED = pd.DataFrame( + data=[ + ["ENSDART00000161035.1", 397, 472, "ENSDART00000161035.1:397-472"], + ["ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326"], + ["ENSDART00000161035.1", 477, 523, "ENSDART00000161035.1:477-523"], + ["ENSDART00000165342.1", 1176, 1324, "ENSDART00000165342.1:1176-1324"], + ["ENSDART00000165342.1", 125, 304, "ENSDART00000165342.1:125-304"], + ["ENSDART00000165342.1", 746, 851, "ENSDART00000165342.1:746-851"], + ["ENSDART00000165342.1", 974, 1097, "ENSDART00000165342.1:974-1097"], + ["ENSDART00000165342.1", 854, 886, "ENSDART00000165342.1:854-886"], + ["ENSDART00000165342.1", 1098, 1175, "ENSDART00000165342.1:1098-1175"], + ["ENSDART00000165342.1", 5, 127, "ENSDART00000165342.1:5-127"], + ["ENSDART00000165342.1", 645, 746, "ENSDART00000165342.1:645-746"], + ["ENSDART00000165342.1", 317, 460, "ENSDART00000165342.1:317-460"], + ["ENSDART00000165342.1", 591, 650, "ENSDART00000165342.1:591-650"], + ["ENSDART00000165342.1", 459, 592, "ENSDART00000165342.1:459-592"], + ["ENSDART00000165342.1", 899, 953, "ENSDART00000165342.1:899-953"] + ], + columns=["chrom", "chromStart", "chromEnd", "name"] +) -class TestPolishSpliceGraph(TestCase, CustomAssertions): - """Tests for exfi.polish.polish_splice_graph""" - def test_empty(self): - """exfi.polish.polish_splice_graph: empty case""" - actual = polish_splice_graph(SPLICE_GRAPH_EMPTY, TRANSCRIPTOME_EMPTY_DICT) - expected = POLISHED_EMPTY - self.assertEqualSpliceGraphs(actual, expected) - def test_simple(self): - """exfi.polish.polish_splice_graph: simple case""" - actual = polish_splice_graph(SPLICE_GRAPH_SIMPLE, TRANSCRIPTOME_SIMPLE_DICT) - expected = POLISHED_SIMPLE - self.assertEqualSpliceGraphs(actual, expected) - - def test_complex(self): - """exfi.polish.polish_splice_graph: complex case""" - actual = polish_splice_graph(SPLICE_GRAPH_COMPLEX, TRANSCRIPTOME_COMPLEX_DICT) - expected = POLISHED_COMPLEX - print(actual) - print(expected) - self.assertEqualSpliceGraphs(actual, expected) +class TestPolishBED4(TestCase): + """Tests for exfi.polish.polish_bed4""" -class TestPolishSpliceGraphDict(TestCase, CustomAssertions): - """Tests for exfi.polish.polish_splice_graph_dict""" def test_empty(self): - """exfi.polish.polish_splice_graph_dict: empty case""" - actual = polish_splice_graph_dict(SPLICE_GRAPH_EMPTY_DICT, TRANSCRIPTOME_EMPTY_DICT, ARGS) - expected = POLISHED_EMPTY_DICT - self.assertEqualDictOfSpliceGraphs(actual, expected) + """exfi.polish.polish_bed4: empty case""" + observed = polish_bed4(BED4_EMPTY, TRANSCRIPTOME_EMPTY_DICT) + self.assertTrue(observed.shape == (0, 4)) def test_simple(self): - """exfi.polish.polish_splice_graph_dict: simple case""" - actual = polish_splice_graph_dict(SPLICE_GRAPH_SIMPLE_DICT, TRANSCRIPTOME_SIMPLE_DICT, ARGS) - expected = POLISHED_SIMPLE_DICT - self.assertEqualDictOfSpliceGraphs(actual, expected) + """exfi.polish.polish_bed4: simple case""" + observed = polish_bed4(BED4_SIMPLE, TRANSCRIPTOME_SIMPLE_DICT) + print("Observed:\n", observed) + print("Expected:\n", BED4_SIMPLE_POLISHED) + self.assertTrue(observed.equals(BED4_SIMPLE_POLISHED)) def test_complex(self): - """exfi.polish.polish_splice_graph_dict: complex case""" - actual = polish_splice_graph_dict( - SPLICE_GRAPH_COMPLEX_DICT, TRANSCRIPTOME_COMPLEX_DICT, ARGS) - expected = POLISHED_COMPLEX_DICT - self.assertEqualDictOfSpliceGraphs(actual, expected) + """exfi.polish.polish_bed4: complex case""" + observed = polish_bed4(BED4_COMPLEX, TRANSCRIPTOME_COMPLEX_DICT) + print("Observed:\n", observed) + print("Expected:\n", BED4_COMPLEX_POLISHED) + self.assertTrue(observed.equals(BED4_COMPLEX_POLISHED)) From 6244577c724614f060a4269222b0d658470d0fd2 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Tue, 15 Jan 2019 12:37:19 +0100 Subject: [PATCH 21/45] Tests for exfi.io.bed.py --- exfi/io/bed.py | 2 +- tests/io/bed.py | 239 ++++++++++++++++++++++++++++++++ tests/io/transcriptome_dicts.py | 56 ++++++++ tests/test_io/test_bed.py | 128 +++++++++++++++++ 4 files changed, 424 insertions(+), 1 deletion(-) create mode 100644 tests/io/bed.py create mode 100644 tests/io/transcriptome_dicts.py create mode 100644 tests/test_io/test_bed.py diff --git a/exfi/io/bed.py b/exfi/io/bed.py index 4bacb6f..c6a7ad5 100644 --- a/exfi/io/bed.py +++ b/exfi/io/bed.py @@ -75,4 +75,4 @@ def bed4_to_edge2overlap(bed4): overlaps = overlaps\ [["name", "nameNext", "overlap"]]\ .rename({"name": "u", "nameNext": "v"}, axis=1) - return overlaps + return overlaps.reset_index(drop=True) diff --git a/tests/io/bed.py b/tests/io/bed.py new file mode 100644 index 0000000..7e88d1f --- /dev/null +++ b/tests/io/bed.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python3 + +"""tests.io.bed.py: variables for testing bed dataframes""" + +import pandas as pd +import numpy as np + +BED3_EMPTY = pd.DataFrame( + data=None, + columns=["chrom", "chromStart", "chromEnd"] +) + +BED3_SIMPLE = pd.DataFrame( + data=[("ENSDART00000161035.1", 0, 326)], + columns=["chrom", "chromStart", "chromEnd"] +) + +BED3_COMPLEX = pd.DataFrame( + data=[ + ["ENSDART00000161035.1", 0, 326], + ["ENSDART00000161035.1", 397, 472], + ["ENSDART00000161035.1", 477, 523], + ["ENSDART00000165342.1", 5, 127], + ["ENSDART00000165342.1", 125, 304], + ["ENSDART00000165342.1", 317, 460], + ["ENSDART00000165342.1", 459, 592], + ["ENSDART00000165342.1", 591, 650], + ["ENSDART00000165342.1", 645, 746], + ["ENSDART00000165342.1", 746, 851], + ["ENSDART00000165342.1", 854, 886], + ["ENSDART00000165342.1", 899, 953], + ["ENSDART00000165342.1", 974, 1097], + ["ENSDART00000165342.1", 1098, 1175], + ["ENSDART00000165342.1", 1176, 1324] + ], + columns=["chrom", "chromStart", "chromEnd"] +) + + + +BED4_EMPTY = pd.DataFrame( + data=None, columns=["chrom", "chromStart", "chromEnd", "name"] +) + +BED4_SIMPLE = pd.DataFrame( + data=[("ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326")], + columns=["chrom", "chromStart", "chromEnd", "name"] +) + +BED4_COMPLEX = pd.DataFrame( + data=[ + ["ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326"], + ["ENSDART00000161035.1", 397, 472, "ENSDART00000161035.1:397-472"], + ["ENSDART00000161035.1", 477, 523, "ENSDART00000161035.1:477-523"], + ["ENSDART00000165342.1", 5, 127, "ENSDART00000165342.1:5-127"], + ["ENSDART00000165342.1", 125, 304, "ENSDART00000165342.1:125-304"], + ["ENSDART00000165342.1", 317, 460, "ENSDART00000165342.1:317-460"], + ["ENSDART00000165342.1", 459, 592, "ENSDART00000165342.1:459-592"], + ["ENSDART00000165342.1", 591, 650, "ENSDART00000165342.1:591-650"], + ["ENSDART00000165342.1", 645, 746, "ENSDART00000165342.1:645-746"], + ["ENSDART00000165342.1", 746, 851, "ENSDART00000165342.1:746-851"], + ["ENSDART00000165342.1", 854, 886, "ENSDART00000165342.1:854-886"], + ["ENSDART00000165342.1", 899, 953, "ENSDART00000165342.1:899-953"], + ["ENSDART00000165342.1", 974, 1097, "ENSDART00000165342.1:974-1097"], + ["ENSDART00000165342.1", 1098, 1175, "ENSDART00000165342.1:1098-1175"], + ["ENSDART00000165342.1", 1176, 1324, "ENSDART00000165342.1:1176-1324"] + ], + columns=["chrom", "chromStart", "chromEnd", "name"] +) + + + + +NODE2COORDINATES_EMPTY = pd.DataFrame( + data=None, + columns=["chrom", "chromStart", "chromEnd", "name"] +).set_index("name") + +NODE2COORDINATES_SIMPLE = pd.DataFrame( + data=[("ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326")], + columns=["chrom", "chromStart", "chromEnd", "name"] +).set_index("name") + +NODE2COORDINATES_COMPLEX = pd.DataFrame( + data=[ + ["ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326"], + ["ENSDART00000161035.1", 397, 472, "ENSDART00000161035.1:397-472"], + ["ENSDART00000161035.1", 477, 523, "ENSDART00000161035.1:477-523"], + ["ENSDART00000165342.1", 5, 127, "ENSDART00000165342.1:5-127"], + ["ENSDART00000165342.1", 125, 304, "ENSDART00000165342.1:125-304"], + ["ENSDART00000165342.1", 317, 460, "ENSDART00000165342.1:317-460"], + ["ENSDART00000165342.1", 459, 592, "ENSDART00000165342.1:459-592"], + ["ENSDART00000165342.1", 591, 650, "ENSDART00000165342.1:591-650"], + ["ENSDART00000165342.1", 645, 746, "ENSDART00000165342.1:645-746"], + ["ENSDART00000165342.1", 746, 851, "ENSDART00000165342.1:746-851"], + ["ENSDART00000165342.1", 854, 886, "ENSDART00000165342.1:854-886"], + ["ENSDART00000165342.1", 899, 953, "ENSDART00000165342.1:899-953"], + ["ENSDART00000165342.1", 974, 1097, "ENSDART00000165342.1:974-1097"], + ["ENSDART00000165342.1", 1098, 1175, "ENSDART00000165342.1:1098-1175"], + ["ENSDART00000165342.1", 1176, 1324, "ENSDART00000165342.1:1176-1324"] + ], + columns=["chrom", "chromStart", "chromEnd", "name"] +).set_index("name") + + + +PATH2NODES_EMPTY = {} + +PATH2NODES_SIMPLE = { + "ENSDART00000161035.1" : ["ENSDART00000161035.1:0-326"] +} + +PATH2NODES_COMPLEX = { + "ENSDART00000161035.1": [ + "ENSDART00000161035.1:0-326", "ENSDART00000161035.1:397-472", + "ENSDART00000161035.1:477-523" + ], + "ENSDART00000165342.1": [ + "ENSDART00000165342.1:5-127", "ENSDART00000165342.1:125-304", + "ENSDART00000165342.1:317-460", "ENSDART00000165342.1:459-592", + "ENSDART00000165342.1:591-650", "ENSDART00000165342.1:645-746", + "ENSDART00000165342.1:746-851", "ENSDART00000165342.1:854-886", + "ENSDART00000165342.1:899-953", "ENSDART00000165342.1:974-1097", + "ENSDART00000165342.1:1098-1175", "ENSDART00000165342.1:1176-1324" + ] +} + + + +NODE2SEQUENCE_EMPTY = pd.DataFrame(columns=["name", "sequence"]) + +NODE2SEQUENCE_SIMPLE = pd.DataFrame( + data=[ + ["ENSDART00000161035.1:0-326", + "TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATT" + "ATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGG" + "AAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGAT" + "AGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAAC" + "TGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA"] + ], + columns=["name", "sequence"] +) + +NODE2SEQUENCE_COMPLEX = pd.DataFrame( + data=[ + [ + "ENSDART00000161035.1:0-326", + "TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTT" + "GATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAG" + "GGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCT" + "CTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCA" + "GCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA" + ], [ + "ENSDART00000161035.1:397-472", + "AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCC" + "AAATCAACA" + ], [ + "ENSDART00000161035.1:477-523", + "AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA" + ], [ + "ENSDART00000165342.1:5-127", + "TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAA" + "TCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAAG" + ], [ + "ENSDART00000165342.1:125-304", + "AGGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGA" + "CACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGC" + "TCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA" + ], [ + "ENSDART00000165342.1:317-460", + "GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCC" + "TGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAA" + "GAAGTTACCAG" + ], [ + "ENSDART00000165342.1:459-592", + "GTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTT" + "CTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTA" + "A" + ], [ + "ENSDART00000165342.1:591-650", + "AGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTGCCCT" + ], [ + "ENSDART00000165342.1:645-746", + "GCCCTTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTT" + "GGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT" + ], [ + "ENSDART00000165342.1:746-851", + "GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTG" + "GGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA" + ], [ + "ENSDART00000165342.1:854-886", + "TGCAGCCAAACAATGCAACTGTGACAGCAGCA", + ], [ + "ENSDART00000165342.1:899-953", + "TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA" + ], [ + "ENSDART00000165342.1:974-1097", + "CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAG" + "TAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA" + ], [ + "ENSDART00000165342.1:1098-1175", + "TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTC" + "ACATATCCTGA" + ], [ + "ENSDART00000165342.1:1176-1324", + "CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATT" + "GAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTG" + "GCAAAACACCACAGCG" + ] + ], + columns=["name", "sequence"] +) + + + +EDGE2OVERLAP_EMPTY = pd.DataFrame(columns=["u", "v", "overlap"]) + +EDGE2OVERLAP_SIMPLE = pd.DataFrame(columns=["u", "v", "overlap"]) +EDGE2OVERLAP_SIMPLE = EDGE2OVERLAP_SIMPLE.astype({"overlap": np.int64}) + +EDGE2OVERLAP_COMPLEX = pd.DataFrame( + data=[ + ["ENSDART00000161035.1:0-326", "ENSDART00000161035.1:397-472", -71], + ["ENSDART00000161035.1:397-472", "ENSDART00000161035.1:477-523", -5], + ["ENSDART00000165342.1:5-127", "ENSDART00000165342.1:125-304", 2], + ["ENSDART00000165342.1:125-304", "ENSDART00000165342.1:317-460", -13], + ["ENSDART00000165342.1:317-460", "ENSDART00000165342.1:459-592", 1], + ["ENSDART00000165342.1:459-592", "ENSDART00000165342.1:591-650", 1], + ["ENSDART00000165342.1:591-650", "ENSDART00000165342.1:645-746", 5], + ["ENSDART00000165342.1:645-746", "ENSDART00000165342.1:746-851", 0], + ["ENSDART00000165342.1:746-851", "ENSDART00000165342.1:854-886", -3], + ["ENSDART00000165342.1:854-886", "ENSDART00000165342.1:899-953", -13], + ["ENSDART00000165342.1:899-953", "ENSDART00000165342.1:974-1097", -21], + ["ENSDART00000165342.1:974-1097", "ENSDART00000165342.1:1098-1175", -1], + ["ENSDART00000165342.1:1098-1175", "ENSDART00000165342.1:1176-1324", -1] + ], + columns=["u", "v", "overlap"] +) +EDGE2OVERLAP_COMPLEX = EDGE2OVERLAP_COMPLEX.astype({"overlap": np.int64}) diff --git a/tests/io/transcriptome_dicts.py b/tests/io/transcriptome_dicts.py new file mode 100644 index 0000000..c4d5d5f --- /dev/null +++ b/tests/io/transcriptome_dicts.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 + +"""tests.io.transcrtiptome_dicts.py: variables for testing""" + +TRANSCRIPTOME_EMPTY_DICT = {} +TRANSCRIPTOME_SIMPLE_DICT = { + "ENSDART00000161035.1": + "TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATT" + "ATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGG" + "AAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGAT" + "AGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAAC" + "TGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCACGACAGCCGGAGGTCCTGCAGAGG" + "TGGCCAAAGATGTGACGGAAGAGCCTTCGCCGGACGACGAGAAGCGGAGGAACTACGGTGGAGTGTATGT" + "GGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACACGCAAAGTCAACAGATGT" + "TTATTGCAGACCTTCAGATAAAACAACATAGAAC" +} +TRANSCRIPTOME_COMPLEX_DICT = { + "ENSDART00000161035.1": + "TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATT" + "ATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGG" + "AAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGAT" + "AGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAAC" + "TGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCACGACAGCCGGAGGTCCTGCAGAGG" + "TGGCCAAAGATGTGACGGAAGAGCCTTCGCCGGACGACGAGAAGCGGAGGAACTACGGTGGAGTGTATGT" + "GGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACACGCAAAGTCAACAGATGT" + "TTATTGCAGACCTTCAGATAAAACAACATAGAAC", + "ENSDART00000165342.1": + "AAATATGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAA" + "ATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAAGGACCTGTAGTAGA" + "AACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTAT" + "CTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAAT" + "GGCAAGGAGTTAGAGATGCCACCAAACAGCCACGGATGTGCCTCCAGGAAAGGCAAATGACTGTAACTGA" + "ACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTAC" + "ACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAGTCATGGTTTGGATTCATGGTGGAGGACTCT" + "CTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCAT" + "TCAGTACAGATTGGGTCTTCTGGGGTTCTTAAGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTT" + "CTGGATCAAGTAGCTGCCCTTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAG" + "TGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCTGATTCTTTCCCCGCTGGCGTCTGG" + "ACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTT" + "CAGAGAGCCCAGACTGCAGCCAAACAATGCAACTGTGACAGCAGCAGTTCAGCAAAGATTGTCGACTGCA" + "TTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAAGTTCCAGATGATGCACTTCTCCGTTGC" + "TGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCT" + "CTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGAATATTTCTTGGGTCCTGAATGGA" + "TGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGATCCCAAGGATCGATG" + "GATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAATCCGTGAAGTTTAT" + "CGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCGGCACAG" + "GAGCTCCAGTGTACCTGTATGAGCTCCAGCATCCGCCCAGCTTCCTCCAGAAGAAGAGGCCTAGCTTCGT" + "GGGTGTGGACCACGCTGATGACCTCTACTTCATCCAGGGCACCTGCTTCGCTAAAGCCCATCTCAGAATA" + "AGCGCTCCTTTCACAGAAGAAGAAAACGAGCTCTGCAGGACTGTGATGGCCTACTGGGGGAACTTTGCAC" + "ACACTGGGTCTCCCAATGGTCCGGGTCTGACACATTGGCCAGAATATGAAAATGAGAATGAGTATCTTGC" + "CATCGGACTGCAACAGAAACCTGGGAAAAACCTTAAGCACAAACACTATGAGTTCATGACCAAGACTCTT" + "CCAGAACTCATACGTCAAGGAAAAACCAAGCACTCAGAACTGTAAAAGAAAAAAAAATGACAAGACAATT" + "AACTGTGATTCAGAATAACAATATTACCATGTAAAGATCAAATAACGACCTTTATTATCACATATATTGA" + "TGGCAATTTTAAAGATCTTTGACAGAACTTTAGCTGTCAGATTGTATTTTAAAAGAAATAACTAATGTTA" + "TGCACTGAAAAATAAAAGTCGGACACCTT" +} diff --git a/tests/test_io/test_bed.py b/tests/test_io/test_bed.py new file mode 100644 index 0000000..0d053b9 --- /dev/null +++ b/tests/test_io/test_bed.py @@ -0,0 +1,128 @@ +#!/usr/bin/ebv python3 + +"""tests.io.bed.py: tests for exfi.io.bed.py""" + +import unittest + +from exfi.io.bed import \ + bed3_to_bed4, \ + bed4_to_node2coordinates, \ + bed4_to_path2nodes, \ + bed4_to_node2sequence, \ + bed4_to_edge2overlap + +from tests.io.bed import \ + BED3_EMPTY, BED3_SIMPLE, BED3_COMPLEX, \ + BED4_EMPTY, BED4_SIMPLE, BED4_COMPLEX, \ + NODE2COORDINATES_EMPTY, NODE2COORDINATES_SIMPLE, NODE2COORDINATES_COMPLEX, \ + PATH2NODES_EMPTY, PATH2NODES_SIMPLE, PATH2NODES_COMPLEX, \ + NODE2SEQUENCE_EMPTY, NODE2SEQUENCE_SIMPLE, NODE2SEQUENCE_COMPLEX, \ + EDGE2OVERLAP_EMPTY, EDGE2OVERLAP_SIMPLE, EDGE2OVERLAP_COMPLEX + +from tests.io.transcriptome_dicts import \ + TRANSCRIPTOME_EMPTY_DICT, TRANSCRIPTOME_SIMPLE_DICT, \ + TRANSCRIPTOME_COMPLEX_DICT + +class TestBed3ToBed4(unittest.TestCase): + """Tests for exfi.io.bed.bed3_to_bed4""" + + def test_empty(self): + """exfi.io.bed.bed3_to_bed4: empty case""" + observed = bed3_to_bed4(BED3_EMPTY) + self.assertTrue(BED4_EMPTY.equals(observed)) + + def test_simple(self): + """exfi.io.bed.bed3_to_bed4: simple case""" + observed = bed3_to_bed4(BED3_SIMPLE) + self.assertTrue(observed.equals(BED4_SIMPLE)) + + def test_complex(self): + """exfi.io.bed.bed3_to_bed4: complex case""" + observed = bed3_to_bed4(BED3_COMPLEX) + self.assertTrue(observed.equals(BED4_COMPLEX)) + + +class TestBed4ToNode2Coordinates(unittest.TestCase): + """Tests for exfi.io.bed.bed4_to_node2coordinates""" + def test_empty(self): + """exfi.io.bed.bed4_to_node2coordinates: empty case""" + observed = bed4_to_node2coordinates(BED4_EMPTY) + self.assertTrue(observed.equals(NODE2COORDINATES_EMPTY)) + + def test_simple(self): + """exfi.io.bed.bed4_to_node2coordinates: simple case""" + observed = bed4_to_node2coordinates(BED4_SIMPLE) + self.assertTrue(observed.equals(NODE2COORDINATES_SIMPLE)) + + def test_complex(self): + """exfi.io.bed.bed4_to_node2coordinates: complex case""" + observed = bed4_to_node2coordinates(BED4_COMPLEX) + self.assertTrue(observed.equals(NODE2COORDINATES_COMPLEX)) + + +class TestBed4ToPath2Nodes(unittest.TestCase): + """Tests for exfi.io.bed.bed4_to_path2nodes""" + def test_empty(self): + """exfi.io.bed.bed4_to_path2nodes: empty case""" + observed = bed4_to_path2nodes(BED4_EMPTY) + self.assertEqual(observed, PATH2NODES_EMPTY) + + def test_simple(self): + """exfi.io.bed.bed4_to_path2nodes: simple case""" + observed = bed4_to_path2nodes(BED4_SIMPLE) + self.assertEqual(observed, PATH2NODES_SIMPLE) + + def test_complex(self): + """exfi.io.bed.bed4_to_path2nodes: complex case""" + observed = bed4_to_path2nodes(BED4_COMPLEX) + print("Observed:\n", observed) + print("Expected:\n", PATH2NODES_COMPLEX) + self.assertEqual(observed, PATH2NODES_COMPLEX) + + +class TestBed4ToNode2Sequence(unittest.TestCase): + """Tests for exfi.io.bed.bed4_to_node2sequence""" + + def test_empty(self): + """exfi.io.bed.bed4_to_node2sequence: empty case""" + observed = bed4_to_node2sequence(BED4_EMPTY, TRANSCRIPTOME_EMPTY_DICT) + self.assertTrue(observed.equals(NODE2SEQUENCE_EMPTY)) + + def test_simple(self): + """exfi.io.bed.bed4_to_node2sequence: simple case""" + observed = bed4_to_node2sequence(BED4_SIMPLE, TRANSCRIPTOME_SIMPLE_DICT) + self.assertTrue(observed.equals(NODE2SEQUENCE_SIMPLE)) + + def test_complex(self): + """exfi.io.bed.bed4_to_node2sequence: complex case""" + observed = bed4_to_node2sequence( + BED4_COMPLEX, TRANSCRIPTOME_COMPLEX_DICT + ) + self.assertTrue(observed.equals(NODE2SEQUENCE_COMPLEX)) + + +class TestBed4ToEdge2Overlap(unittest.TestCase): + """Tests for exfi.io.bed.bed4_to_edge2overlap""" + + def test_empty(self): + """exfi.io.bed.bed4_to_edge2overlap: empty case""" + observed = bed4_to_edge2overlap(BED4_EMPTY) + self.assertTrue(observed.equals(EDGE2OVERLAP_EMPTY)) + + def test_simple(self): + """exfi.io.bed.bed4_to_edge2overlap: simple case""" + observed = bed4_to_edge2overlap(BED4_SIMPLE) + print("Observed:\n", observed, observed.dtypes) + print("Expected:\n", EDGE2OVERLAP_SIMPLE, EDGE2OVERLAP_SIMPLE.dtypes) + self.assertTrue(observed.equals(EDGE2OVERLAP_SIMPLE)) + + def test_complex(self): + """exfi.io.bed.bed4_to_edge2overlap: complex case""" + observed = bed4_to_edge2overlap(BED4_COMPLEX) + print("Observed:\n", observed, observed.dtypes) + print("Expected:\n", EDGE2OVERLAP_COMPLEX, EDGE2OVERLAP_COMPLEX.dtypes) + self.assertTrue(observed.equals(EDGE2OVERLAP_COMPLEX)) + + +if __name__ == '__main__': + unittest.main() From b2bfa569ab3b720e24af2fd594d9ce006e7a5d69 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Tue, 15 Jan 2019 16:45:11 +0100 Subject: [PATCH 22/45] Tests for exfi.io.bed4_to_gfa1.py --- exfi/io/bed4_to_gfa1.py | 1 + tests/io/complex.gfa | 20 +- tests/io/gfa1.py | 273 ++++++++++++++++++ tests/io/simple.gfa | 2 +- tests/test_build_splice_graph_dict.py | 246 ---------------- tests/test_io/test_bed4_to_gfa1.py | 177 ++++++++++++ .../test_io/test_splice_graph_dict_to_gfa1.py | 168 ----------- 7 files changed, 462 insertions(+), 425 deletions(-) create mode 100644 tests/io/gfa1.py delete mode 100644 tests/test_build_splice_graph_dict.py create mode 100644 tests/test_io/test_bed4_to_gfa1.py delete mode 100644 tests/test_io/test_splice_graph_dict_to_gfa1.py diff --git a/exfi/io/bed4_to_gfa1.py b/exfi/io/bed4_to_gfa1.py index 55c72ca..4427ec0 100644 --- a/exfi/io/bed4_to_gfa1.py +++ b/exfi/io/bed4_to_gfa1.py @@ -65,6 +65,7 @@ def compute_paths(bed4): .drop(columns=["chromStart", "chromEnd"])\ .groupby("chrom", axis=0)\ .aggregate(lambda x: ",".join(x.tolist())) + paths = paths.astype({"name": str}) # It may end up as float paths = paths.reset_index(drop=False) paths["RecordType"] = "P" paths = paths.rename({"chrom": "PathName", "name": "SegmentNames"}, axis=1) diff --git a/tests/io/complex.gfa b/tests/io/complex.gfa index ac25b35..7cc0a18 100644 --- a/tests/io/complex.gfa +++ b/tests/io/complex.gfa @@ -14,19 +14,19 @@ S ENSDART00000165342.1:899-953 TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCT S ENSDART00000165342.1:974-1097 CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA LN:i:123 S ENSDART00000165342.1:1098-1175 TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGA LN:i:77 S ENSDART00000165342.1:1176-1324 CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG LN:i:148 -L ENSDART00000161035.1:0-326 + ENSDART00000161035.1:397-472 + 71G -L ENSDART00000161035.1:397-472 + ENSDART00000161035.1:477-523 + 5G +L ENSDART00000161035.1:0-326 + ENSDART00000161035.1:397-472 + 71N +L ENSDART00000161035.1:397-472 + ENSDART00000161035.1:477-523 + 5N L ENSDART00000165342.1:5-127 + ENSDART00000165342.1:125-304 + 2M -L ENSDART00000165342.1:125-304 + ENSDART00000165342.1:317-460 + 13G +L ENSDART00000165342.1:125-304 + ENSDART00000165342.1:317-460 + 13N L ENSDART00000165342.1:317-460 + ENSDART00000165342.1:459-592 + 1M L ENSDART00000165342.1:459-592 + ENSDART00000165342.1:591-650 + 1M L ENSDART00000165342.1:591-650 + ENSDART00000165342.1:645-746 + 5M L ENSDART00000165342.1:645-746 + ENSDART00000165342.1:746-851 + 0M -L ENSDART00000165342.1:746-851 + ENSDART00000165342.1:854-886 + 3G -L ENSDART00000165342.1:854-886 + ENSDART00000165342.1:899-953 + 13G -L ENSDART00000165342.1:899-953 + ENSDART00000165342.1:974-1097 + 21G -L ENSDART00000165342.1:974-1097 + ENSDART00000165342.1:1098-1175 + 1G -L ENSDART00000165342.1:1098-1175 + ENSDART00000165342.1:1176-1324 + 1G +L ENSDART00000165342.1:746-851 + ENSDART00000165342.1:854-886 + 3N +L ENSDART00000165342.1:854-886 + ENSDART00000165342.1:899-953 + 13N +L ENSDART00000165342.1:899-953 + ENSDART00000165342.1:974-1097 + 21N +L ENSDART00000165342.1:974-1097 + ENSDART00000165342.1:1098-1175 + 1N +L ENSDART00000165342.1:1098-1175 + ENSDART00000165342.1:1176-1324 + 1N C ENSDART00000161035.1 + ENSDART00000161035.1:0-326 + 0 326M C ENSDART00000161035.1 + ENSDART00000161035.1:397-472 + 397 75M C ENSDART00000161035.1 + ENSDART00000161035.1:477-523 + 477 46M @@ -42,5 +42,5 @@ C ENSDART00000165342.1 + ENSDART00000165342.1:899-953 + 899 54M C ENSDART00000165342.1 + ENSDART00000165342.1:974-1097 + 974 123M C ENSDART00000165342.1 + ENSDART00000165342.1:1098-1175 + 1098 77M C ENSDART00000165342.1 + ENSDART00000165342.1:1176-1324 + 1176 148M -P ENSDART00000161035.1 ENSDART00000161035.1:0-326+,ENSDART00000161035.1:397-472+,ENSDART00000161035.1:477-523+ -P ENSDART00000165342.1 ENSDART00000165342.1:5-127+,ENSDART00000165342.1:125-304+,ENSDART00000165342.1:317-460+,ENSDART00000165342.1:459-592+,ENSDART00000165342.1:591-650+,ENSDART00000165342.1:645-746+,ENSDART00000165342.1:746-851+,ENSDART00000165342.1:854-886+,ENSDART00000165342.1:899-953+,ENSDART00000165342.1:974-1097+,ENSDART00000165342.1:1098-1175+,ENSDART00000165342.1:1176-1324+ +P ENSDART00000161035.1 ENSDART00000161035.1:0-326+,ENSDART00000161035.1:397-472+,ENSDART00000161035.1:477-523+ * +P ENSDART00000165342.1 ENSDART00000165342.1:5-127+,ENSDART00000165342.1:125-304+,ENSDART00000165342.1:317-460+,ENSDART00000165342.1:459-592+,ENSDART00000165342.1:591-650+,ENSDART00000165342.1:645-746+,ENSDART00000165342.1:746-851+,ENSDART00000165342.1:854-886+,ENSDART00000165342.1:899-953+,ENSDART00000165342.1:974-1097+,ENSDART00000165342.1:1098-1175+,ENSDART00000165342.1:1176-1324+ * diff --git a/tests/io/gfa1.py b/tests/io/gfa1.py new file mode 100644 index 0000000..1128d5b --- /dev/null +++ b/tests/io/gfa1.py @@ -0,0 +1,273 @@ +#!/usr/bin/env python3 + +"""tests.io.gfa1.py: Fragments of GFA1 files""" + +import pandas as pd + +HEADER = pd.DataFrame( + data=[["H", "VN:Z:1.0"]], + columns=["RecordType", "Version"] +) + + + +SEGMENTS_EMPTY = pd.DataFrame( + columns=["RecordType", "name", "sequence", "SegmentLength"] +) + +SEGMENTS_SIMPLE = pd.DataFrame( + data=[[ + "S", + "ENSDART00000161035.1:0-326", + "TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATT" + "ATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGG" + "AAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGAT" + "AGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAAC" + "TGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA", + "LN:i:326" + ]], + columns=["RecordType", "name", "sequence", "SegmentLength"] +) + +SEGMENTS_COMPLEX = pd.DataFrame( + data=[[ + "S", + "ENSDART00000161035.1:0-326", + "TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATT" + "ATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGG" + "AAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGAT" + "AGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAAC" + "TGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA", + "LN:i:326", + ], [ + "S", + "ENSDART00000161035.1:397-472", + "AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAAT" + "CAACA", + "LN:i:75", + ], [ + "S", + "ENSDART00000161035.1:477-523", + "AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA", "LN:i:46" + ], [ + "S", + "ENSDART00000165342.1:5-127", + "TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCAC" + "TTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAAG", "LN:i:122" + ], [ + "S", + "ENSDART00000165342.1:125-304", + "AGGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACA" + "ATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCAC" + "AGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA", "LN:i:179" + ], [ + "S", + "ENSDART00000165342.1:317-460", + "GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAG" + "GTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTAC" + "CAG", "LN:i:143" + ], [ + "S", + "ENSDART00000165342.1:459-592", + "GTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGG" + "CTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAA", + "LN:i:133" + ], [ + "S", + "ENSDART00000165342.1:591-650", + "AGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTGCCCT", + "LN:i:59" + ], [ + "S", + "ENSDART00000165342.1:645-746", + "GCCCTTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAG" + "AGTCTGCTGGAGGAATCAGTGTATCCACGCT", + "LN:i:101", + ], [ + "S", + "ENSDART00000165342.1:746-851", + "GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGAT" + "GGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA", "LN:i:105", + ], [ + "S", + "ENSDART00000165342.1:854-886", + "TGCAGCCAAACAATGCAACTGTGACAGCAGCA", + "LN:i:32" + ], [ + "S", + "ENSDART00000165342.1:899-953", + "TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA", "LN:i:54" + ], [ + "S", + "ENSDART00000165342.1:974-1097", + "CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAA" + "GTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA", + "LN:i:123" + ], [ + "S", + "ENSDART00000165342.1:1098-1175", + "TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACAT" + "ATCCTGA", + "LN:i:77", + ], [ + "S", + "ENSDART00000165342.1:1176-1324", + "CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAA" + "TCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACA" + "CCACAGCG", + "LN:i:148" + ]], + columns=["RecordType", "name", "sequence", "SegmentLength"] +) + + + +LINKS_EMPTY = pd.DataFrame( + columns=["RecordType", "From", "FromOrient", "To", "ToOrient", "Overlap"] +) + +LINKS_SIMPLE = pd.DataFrame( + columns=["RecordType", "From", "FromOrient", "To", "ToOrient", "Overlap"] +) + +LINKS_COMPLEX = pd.DataFrame( + data=[[ + "L", "ENSDART00000161035.1:0-326", "+", + "ENSDART00000161035.1:397-472", "+", "71N" + ], [ + "L", "ENSDART00000161035.1:397-472", "+", + "ENSDART00000161035.1:477-523", "+", "5N" + ], [ + "L", "ENSDART00000165342.1:5-127", "+", + "ENSDART00000165342.1:125-304", "+", "2M" + ], [ + "L", "ENSDART00000165342.1:125-304", "+", + "ENSDART00000165342.1:317-460", "+", "13N" + ], [ + "L", "ENSDART00000165342.1:317-460", "+", + "ENSDART00000165342.1:459-592", "+", "1M" + ], [ + "L", "ENSDART00000165342.1:459-592", "+", + "ENSDART00000165342.1:591-650", "+", "1M" + ], [ + "L", "ENSDART00000165342.1:591-650", "+", + "ENSDART00000165342.1:645-746", "+", "5M" + ], [ + "L", "ENSDART00000165342.1:645-746", "+", + "ENSDART00000165342.1:746-851", "+", "0M" + ], [ + "L", "ENSDART00000165342.1:746-851", "+", + "ENSDART00000165342.1:854-886", "+", "3N" + ], [ + "L", "ENSDART00000165342.1:854-886", "+", + "ENSDART00000165342.1:899-953", "+", "13N" + ], [ + "L", "ENSDART00000165342.1:899-953", "+", + "ENSDART00000165342.1:974-1097", "+", "21N" + ], [ + "L", "ENSDART00000165342.1:974-1097", "+", + "ENSDART00000165342.1:1098-1175", "+", "1N" + ], [ + "L", "ENSDART00000165342.1:1098-1175", "+", + "ENSDART00000165342.1:1176-1324", "+", "1N" + ]], + columns=["RecordType", "From", "FromOrient", "To", "ToOrient", "Overlap"] +) + + + +CONTAINMENTS_EMPTY = pd.DataFrame( + columns=["RecordType", "Container", "ContainerOrient", "Contained", + "ContainedOrient", "Pos", "Overlap"] +) + +CONTAINMENTS_SIMPLE = pd.DataFrame( + data=[[ + "C", "ENSDART00000161035.1", "+", "ENSDART00000161035.1:0-326", "+", + 0, "326M" + ]], + columns=["RecordType", "Container", "ContainerOrient", "Contained", + "ContainedOrient", "Pos", "Overlap"] +) +CONTAINMENTS_COMPLEX = pd.DataFrame( + data=[[ + "C", "ENSDART00000161035.1", "+", "ENSDART00000161035.1:0-326", "+", + 0, "326M" + ], [ + "C", "ENSDART00000161035.1", "+", "ENSDART00000161035.1:397-472", "+", + 397, "75M" + ], [ + "C", "ENSDART00000161035.1", "+", "ENSDART00000161035.1:477-523", "+", + 477, "46M" + ], [ + "C", "ENSDART00000165342.1", "+", "ENSDART00000165342.1:5-127", "+", + 5, "122M" + ], [ + "C", "ENSDART00000165342.1", "+", "ENSDART00000165342.1:125-304", "+", + 125, "179M" + ], [ + "C", "ENSDART00000165342.1", "+", "ENSDART00000165342.1:317-460", "+", + 317, "143M" + ], [ + "C", "ENSDART00000165342.1", "+", "ENSDART00000165342.1:459-592", "+", + 459, "133M" + ], [ + "C", "ENSDART00000165342.1", "+", "ENSDART00000165342.1:591-650", "+", + 591, "59M" + ], [ + "C", "ENSDART00000165342.1", "+", "ENSDART00000165342.1:645-746", "+", + 645, "101M" + ], [ + "C", "ENSDART00000165342.1", "+", "ENSDART00000165342.1:746-851", "+", + 746, "105M" + ], [ + "C", "ENSDART00000165342.1", "+", "ENSDART00000165342.1:854-886", "+", + 854, "32M" + ], [ + "C", "ENSDART00000165342.1", "+", "ENSDART00000165342.1:899-953", "+", + 899, "54M" + ], [ + "C", "ENSDART00000165342.1", "+", "ENSDART00000165342.1:974-1097", "+", + 974, "123M" + ], [ + "C", "ENSDART00000165342.1", "+", "ENSDART00000165342.1:1098-1175", + "+", 1098, "77M" + ], [ + "C", "ENSDART00000165342.1", "+", "ENSDART00000165342.1:1176-1324", + "+", 1176, "148M" + ]], + columns=["RecordType", "Container", "ContainerOrient", "Contained", + "ContainedOrient", "Pos", "Overlap"] +) + + +PATHS_EMPTY = pd.DataFrame( + columns=["RecordType", "PathName", "SegmentNames", "Overlaps"] +) +PATHS_SIMPLE = pd.DataFrame( + data=[["P", "ENSDART00000161035.1", "ENSDART00000161035.1:0-326+", "*"]], + columns=["RecordType", "PathName", "SegmentNames", "Overlaps"] +) +PATHS_COMPLEX = pd.DataFrame( + data=[[ + "P", "ENSDART00000161035.1", + "ENSDART00000161035.1:0-326+," + "ENSDART00000161035.1:397-472+," + "ENSDART00000161035.1:477-523+", + "*" + ], [ + "P", "ENSDART00000165342.1", + "ENSDART00000165342.1:5-127+,ENSDART00000165342.1:125-304+," + "ENSDART00000165342.1:317-460+,ENSDART00000165342.1:459-592+," + "ENSDART00000165342.1:591-650+,ENSDART00000165342.1:645-746+," + "ENSDART00000165342.1:746-851+,ENSDART00000165342.1:854-886+," + "ENSDART00000165342.1:899-953+,ENSDART00000165342.1:974-1097+," + "ENSDART00000165342.1:1098-1175+,ENSDART00000165342.1:1176-1324+", + "*" + ]], + columns=["RecordType", "PathName", "SegmentNames", "Overlaps"] +) + +GFA1_EMPTY_FN = "tests/io/empty.gfa" +GFA1_SIMPLE_FN = "tests/io/simple.gfa" +GFA1_COMPLEX_FN = "tests/io/complex.gfa" diff --git a/tests/io/simple.gfa b/tests/io/simple.gfa index f57b66f..488ea37 100644 --- a/tests/io/simple.gfa +++ b/tests/io/simple.gfa @@ -1,4 +1,4 @@ H VN:Z:1.0 S ENSDART00000161035.1:0-326 TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA LN:i:326 C ENSDART00000161035.1 + ENSDART00000161035.1:0-326 + 0 326M -P ENSDART00000161035.1 ENSDART00000161035.1:0-326+ +P ENSDART00000161035.1 ENSDART00000161035.1:0-326+ * diff --git a/tests/test_build_splice_graph_dict.py b/tests/test_build_splice_graph_dict.py deleted file mode 100644 index 3f86c49..0000000 --- a/tests/test_build_splice_graph_dict.py +++ /dev/null @@ -1,246 +0,0 @@ -#!/usr/bin/env python3 - -""" -Tests for exfi.build_splice_graph -""" - -from typing import Iterable - -import unittest - -import networkx as nx - -from exfi.classes import Coordinate, SpliceGraph, \ - SpliceGraphDict - -from exfi.build_splice_graph_dict import \ - _bed3_to_str, \ - bed3_records_to_bed6df_dict, \ - bed6df_to_path2node, \ - bed6df_to_node2coordinates, \ - compute_edge_overlaps, \ - build_splice_graph, \ - build_splice_graph_dict - -from tests.custom_assertions import \ - CustomAssertions - -from tests.data import \ - BED3RECORDS_EMPTY, BED3RECORDS_SIMPLE, BED3RECORDS_COMPLEX, \ - BED6DF_EMPTY, BED6DF_SIMPLE, BED6DF_COMPLEX, \ - BED6DF_DICT_EMPTY, BED6DF_DICT_SIMPLE, BED6DF_DICT_COMPLEX, \ - PATH_EMPTY, PATH_SIMPLE, PATH_COMPLEX, \ - NODE2COORDS_EMPTY, NODE2COORDS_SIMPLE, NODE2COORDS_COMPLEX, \ - OVERLAPS_EMPTY_DICT, OVERLAPS_SIMPLE_DICT, OVERLAPS_COMPLEX_DICT, \ - SPLICE_GRAPH_EMPTY, SPLICE_GRAPH_SIMPLE, SPLICE_GRAPH_COMPLEX, \ - SPLICE_GRAPH_EMPTY_DICT, SPLICE_GRAPH_SIMPLE_DICT, SPLICE_GRAPH_COMPLEX_DICT - -BED3_COLS = ['chrom', 'start', 'end'] -BED6_COLS = ['chrom', 'start', 'end', 'name', 'score', 'strand'] - -ARGS = {"threads": 4} - - -def _prepare_overlaps(bed3_records: Iterable[Coordinate]) -> SpliceGraphDict: - """Compute splicegraph prior the computation of overlaps""" - sg_dict = SpliceGraphDict() - - bed6df_dict = bed3_records_to_bed6df_dict(bed3_records) - for transcript, bed6_df in bed6df_dict.items(): - splice_graph = SpliceGraph() - splice_graph.add_nodes_from(bed6_df["name"].tolist()) - node2coords = bed6df_to_node2coordinates(bed6_df) - nx.set_node_attributes(G=splice_graph, name="coordinates", values=node2coords) - transcript2path = bed6df_to_path2node(bed6_df) - for path in transcript2path.values(): - splice_graph.add_path(path) - sg_dict[transcript] = splice_graph - return sg_dict - - - -class TestBed3ToStr(unittest.TestCase): - """Tests for _bed3_to_str""" - - def test_empty(self): - """exfi.build_splice_graph_dict._bed3_to_str: empty record""" - with self.assertRaises(IndexError): - _bed3_to_str([]) - - def test_malformed1(self): - """exfi.build_splice_graph_dict._bed3_to_str: record of 2 elements""" - with self.assertRaises(IndexError): - _bed3_to_str((0, 1)) - - def test_malformed2(self): - """exfi.build_splice_graph_dict._bed3_to_str: record of 4 elements""" - with self.assertRaises(IndexError): - _bed3_to_str((0, 1, 2, 3)) - - def test_record(self): - """exfi.build_splice_graph_dict._bed3_to_str: correct record""" - self.assertEqual( - _bed3_to_str(("tr", 10, 15)), - "tr:10-15" - ) - - - -class TestBed3RecordsToBed6DFDict(unittest.TestCase, CustomAssertions): - """Tests for bed3_records_to_bed6df_dict""" - - def test_empty_index(self): - """exfi.build_splice_graph_dict.bed3_records_to_bed6df_dict: empty exome""" - actual = bed3_records_to_bed6df_dict(BED3RECORDS_EMPTY) - expected = BED6DF_DICT_EMPTY - self.assertEqualDictOfDF(actual, expected) - - def test_one_entry(self): - """exfi.build_splice_graph_dict.bed3_records_to_bed6df_dict: single exon""" - actual = bed3_records_to_bed6df_dict(BED3RECORDS_SIMPLE) - expected = BED6DF_DICT_SIMPLE - self.assertEqualDictOfDF(actual, expected) - - def test_multiple(self): - """exfi.build_splice_graph_dict.bed3_records_to_bed6df_dict: multiple transcripts - multiple - exons""" - actual = bed3_records_to_bed6df_dict(BED3RECORDS_COMPLEX) - expected = BED6DF_DICT_COMPLEX - self.assertEqualDictOfDF(actual, expected) - - - -class TestBed6DFToPath2Node(unittest.TestCase): - """Tests for bed6df_to_path2node""" - - def test_empty(self): - """exfi.build_splice_graph_dict.bed6df_to_path2node: convert an empty exome to path""" - actual = bed6df_to_path2node(BED6DF_EMPTY) - expected = PATH_EMPTY - self.assertEqual(actual, expected) - - def test_single(self): - """exfi.build_splice_graph_dict.bed6df_to_path2node: convert a single exon transcript to - path""" - actual = bed6df_to_path2node(BED6DF_SIMPLE) - expected = PATH_SIMPLE - self.assertEqual(actual, expected) - - def test_multiple(self): - """exfi.build_splice_graph_dict.bed6df_to_path2node: convert a single exon transcript to - path""" - actual = bed6df_to_path2node(BED6DF_COMPLEX) - expected = PATH_COMPLEX - self.assertEqual(actual, expected) - - - -class TestBed6ToNode2Coord(unittest.TestCase): - """Tests for bed6df_to_path2node""" - - def test_empty(self): - """exfi.build_splice_graph_dict.bed6df_to_node2coordinates: empty records""" - actual = bed6df_to_node2coordinates(BED6DF_EMPTY) - expected = NODE2COORDS_EMPTY - self.assertEqual(actual, expected) - - def test_simple(self): - """exfi.build_splice_graph_dict.bed6df_to_node2coordinates: single node""" - actual = bed6df_to_node2coordinates(BED6DF_SIMPLE) - expected = NODE2COORDS_SIMPLE - self.assertEqual(actual, expected) - - def test_complex(self): - """exfi.build_splice_graph_dict.bed6df_to_node2coordinates: complex case""" - actual = bed6df_to_node2coordinates(BED6DF_COMPLEX) - expected = NODE2COORDS_COMPLEX - self.assertEqual(actual, expected) - - - -class TestComputeEdgeOverlaps(unittest.TestCase): - """Tests for compute_edge_overlaps""" - - def test_empty_exome(self): - """exfi.build_splice_graph_dict.compute_overlaps: compute the overlaps of an empty exome""" - splice_graph_dict = _prepare_overlaps(BED3RECORDS_EMPTY) - overlaps_dict = { - transcript: compute_edge_overlaps(splice_graph) - for transcript, splice_graph in splice_graph_dict.items() - } - self.assertEqual(overlaps_dict, OVERLAPS_EMPTY_DICT) - - def test_single_exon(self): - """exfi.build_splice_graph_dict.compute_overlaps: compute the overlaps of a single exon - exome""" - splice_graph_dict = _prepare_overlaps(BED3RECORDS_SIMPLE) - overlaps_dict = { - transcript: compute_edge_overlaps(splice_graph) - for transcript, splice_graph in splice_graph_dict.items() - } - self.assertEqual(overlaps_dict, OVERLAPS_SIMPLE_DICT) - - def test_multiple_exons(self): - """exfi.build_splice_graph_dict.compute_overlaps: compute the overlaps of a simple exome""" - splice_graph_dict = _prepare_overlaps(BED3RECORDS_COMPLEX) - overlaps_dict = { - transcript: compute_edge_overlaps(splice_graph) - for transcript, splice_graph in splice_graph_dict.items() - } - self.assertEqual(overlaps_dict, OVERLAPS_COMPLEX_DICT) - - - -class TestBuildSpliceGraph(unittest.TestCase, CustomAssertions): - """Tests for exfi.build_splice_graph_dict._build_splice_graph""" - - def test_empty(self): - """exfi.build_splice_graph_dict.build_splice_graph_dict: compute the splice graph of an - empty set of exons""" - actual = build_splice_graph(bed6df=BED6DF_EMPTY) - expected = SPLICE_GRAPH_EMPTY - self.assertEqualSpliceGraphs(actual, expected) - - def test_simple(self): - """exfi.build_splice_graph_dict.build_splice_graph_dict: compute the splice graph of a - single exon""" - actual = build_splice_graph(bed6df=BED6DF_SIMPLE) - expected = SPLICE_GRAPH_SIMPLE - self.assertEqualSpliceGraphs(actual, expected) - - def test_multiple(self): - """exfi.build_splice_graph_dict.build_splice_graph_dict: compute the splice graph of a set - of exons""" - actual = build_splice_graph(bed6df=BED6DF_COMPLEX) - expected = SPLICE_GRAPH_COMPLEX - self.assertEqualSpliceGraphs(actual, expected) - - -class TestBuildSpliceGraphDict(unittest.TestCase, CustomAssertions): - """Tests for build_splice_graph_dict""" - - def test_empty(self): - """exfi.build_splice_graph_dict.build_splice_graph_dict: compute the splice graph of an - empty set of exons""" - actual = build_splice_graph_dict(bed3records=BED3RECORDS_EMPTY, args=ARGS) - expected = SPLICE_GRAPH_EMPTY_DICT - self.assertEqualDictOfSpliceGraphs(actual, expected) - - def test_simple(self): - """exfi.build_splice_graph_dict.build_splice_graph_dict: compute the splice graph of a - single exon""" - actual = build_splice_graph_dict(bed3records=BED3RECORDS_SIMPLE, args=ARGS) - expected = SPLICE_GRAPH_SIMPLE_DICT - self.assertEqualDictOfSpliceGraphs(actual, expected) - - def test_multiple(self): - """exfi.build_splice_graph_dict.build_splice_graph_dict: compute the splice graph of a set - of exons""" - actual = build_splice_graph_dict(bed3records=BED3RECORDS_COMPLEX, args=ARGS) - expected = SPLICE_GRAPH_COMPLEX_DICT - self.assertEqualDictOfSpliceGraphs(actual, expected) - - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/test_io/test_bed4_to_gfa1.py b/tests/test_io/test_bed4_to_gfa1.py new file mode 100644 index 0000000..ecc193d --- /dev/null +++ b/tests/test_io/test_bed4_to_gfa1.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 + +"""tests.test_io.test_bed4_to_gfa1.py: tests for exfi.io.bed4_to_gfa1.py""" + +from unittest import TestCase, main + +from tempfile import mkstemp +import os +import filecmp + +from exfi.io.bed4_to_gfa1 import \ + compute_header, \ + compute_segments, \ + compute_links, \ + compute_containments, \ + compute_paths, \ + bed4_to_gfa1 + +from tests.io.bed import \ + BED4_EMPTY, BED4_SIMPLE, BED4_COMPLEX + +from tests.io.transcriptome_dicts import \ + TRANSCRIPTOME_EMPTY_DICT, TRANSCRIPTOME_SIMPLE_DICT, \ + TRANSCRIPTOME_COMPLEX_DICT + +from tests.io.gfa1 import \ + HEADER, \ + SEGMENTS_EMPTY, SEGMENTS_SIMPLE, SEGMENTS_COMPLEX, \ + LINKS_EMPTY, LINKS_SIMPLE, LINKS_COMPLEX, \ + CONTAINMENTS_EMPTY, CONTAINMENTS_SIMPLE, CONTAINMENTS_COMPLEX, \ + PATHS_EMPTY, PATHS_SIMPLE, PATHS_COMPLEX, \ + GFA1_EMPTY_FN, GFA1_SIMPLE_FN, GFA1_COMPLEX_FN + + + +class TestComputeHeader(TestCase): + """Tests for exfi.io.bed4_to_gfa1.compute_header""" + + def test_header(self): + """exfi.io.bed4_to_gfa1.compute_header: single test""" + observed = compute_header() + self.assertTrue(observed.equals(HEADER)) + + + +class TestComputeSegments(TestCase): + """Tests for exfi.io.bed4_to_gfa1.compute_segments""" + + def test_empty(self): + """exfi.io.bed4_to_gfa1.compute_segments: empty case""" + observed = compute_segments(BED4_EMPTY, TRANSCRIPTOME_EMPTY_DICT) + self.assertTrue(observed.equals(SEGMENTS_EMPTY)) + + def test_simple(self): + """exfi.io.bed4_to_gfa1.compute_segments: simple case""" + observed = compute_segments(BED4_SIMPLE, TRANSCRIPTOME_SIMPLE_DICT) + self.assertTrue(observed.equals(SEGMENTS_SIMPLE)) + + def test_complex(self): + """exfi.io.bed4_to_gfa1.compute_segments: complex case""" + observed = compute_segments(BED4_COMPLEX, TRANSCRIPTOME_COMPLEX_DICT) + self.assertTrue(observed.equals(SEGMENTS_COMPLEX)) + + +class TestComputeLinks(TestCase): + """Tests for exfi.io.bed4_to_gfa1.compute_links""" + + def test_empty(self): + """exfi.io.bed4_to_gfa1.compute_links: empty case""" + observed = compute_links(BED4_EMPTY) + self.assertTrue(observed.equals(LINKS_EMPTY)) + + def test_simple(self): + """exfi.io.bed4_to_gfa1.compute_links: simple case""" + observed = compute_links(BED4_SIMPLE) + self.assertTrue(observed.equals(LINKS_SIMPLE)) + + def test_complex(self): + """exfi.io.bed4_to_gfa1.compute_links: complex case""" + observed = compute_links(BED4_COMPLEX) + # print("Observed", observed, observed.dtypes, sep="\n") + # print("Expected", LINKS_COMPLEX, LINKS_COMPLEX.dtypes, sep="\n") + self.assertTrue(observed.equals(LINKS_COMPLEX)) + + + +class TestComputeContainments(TestCase): + """Tests for exfi.io.bed4_to_gfa1.compute_containments""" + + def test_empty(self): + """exfi.io.bed4_to_gfa1.compute_containments: empty case""" + observed = compute_containments(BED4_EMPTY) + self.assertTrue(observed.equals(CONTAINMENTS_EMPTY)) + + def test_simple(self): + """exfi.io.bed4_to_gfa1.compute_containments: simple case""" + observed = compute_containments(BED4_SIMPLE) + # print("Observed", observed, observed.dtypes, sep="\n") + # print("Expected", CONTAINMENTS_SIMPLE, CONTAINMENTS_SIMPLE.dtypes, sep="\n") + self.assertTrue(observed.equals(CONTAINMENTS_SIMPLE)) + + def test_complex(self): + """exfi.io.bed4_to_gfa1.compute_containments: complex case""" + observed = compute_containments(BED4_COMPLEX) + # print("Observed", observed, observed.dtypes, sep="\n") + # print("Expected", CONTAINMENTS_COMPLEX, CONTAINMENTS_COMPLEX.dtypes, sep="\n") + self.assertTrue(observed.equals(CONTAINMENTS_COMPLEX)) + + + +class TestComputePaths(TestCase): + """Tests for exfi.io.bed4_to_gfa1.compute_paths""" + + def test_empty(self): + """exfi.io.bed4_to_gfa1.compute_paths: empty case""" + observed = compute_paths(BED4_EMPTY) + print("Observed", observed, observed.dtypes, sep="\n") + print("Expected", PATHS_EMPTY, PATHS_EMPTY.dtypes, sep="\n") + self.assertTrue(observed.equals(PATHS_EMPTY)) + + def test_simple(self): + """exfi.io.bed4_to_gfa1.compute_paths: simple case""" + observed = compute_paths(BED4_SIMPLE) + print("Observed", observed, observed.dtypes, sep="\n") + print("Expected", PATHS_SIMPLE, PATHS_SIMPLE.dtypes, sep="\n") + self.assertTrue(observed.equals(PATHS_SIMPLE)) + + def test_complex(self): + """exfi.io.bed4_to_gfa1.compute_paths: complex case""" + observed = compute_paths(BED4_COMPLEX) + print("Observed", observed, observed.dtypes, sep="\n") + print("Expected", PATHS_COMPLEX, PATHS_COMPLEX.dtypes, sep="\n") + self.assertTrue(observed.equals(PATHS_COMPLEX)) + + + +class TestBED4TOGFA1(TestCase): + """Tests for exfi.io.bed4_to_gfa1.bed4_to_gfa1""" + + def test_empty(self): + """exfi.io.bed4_to_gfa1.bed4_to_gfa1: empty case""" + tmp_file = mkstemp()[1] + print(tmp_file) + bed4_to_gfa1( + gfa1_fn=tmp_file, + bed4=BED4_EMPTY, + transcriptome_dict=TRANSCRIPTOME_EMPTY_DICT + ) + self.assertTrue(filecmp.cmp(tmp_file, GFA1_EMPTY_FN)) + os.remove(tmp_file) + + def test_simple(self): + """exfi.io.bed4_to_gfa1.bed4_to_gfa1: simple case""" + tmp_file = mkstemp()[1] + print(tmp_file) + bed4_to_gfa1( + gfa1_fn=tmp_file, + bed4=BED4_SIMPLE, + transcriptome_dict=TRANSCRIPTOME_SIMPLE_DICT + ) + self.assertTrue(filecmp.cmp(tmp_file, GFA1_SIMPLE_FN)) + os.remove(tmp_file) + + def test_complex(self): + """exfi.io.bed4_to_gfa1.bed4_to_gfa1: complex case""" + tmp_file = mkstemp()[1] + print(tmp_file) + bed4_to_gfa1( + gfa1_fn=tmp_file, + bed4=BED4_COMPLEX, + transcriptome_dict=TRANSCRIPTOME_COMPLEX_DICT + ) + self.assertTrue(filecmp.cmp(tmp_file, GFA1_COMPLEX_FN)) + os.remove(tmp_file) + +if __name__ == '__main__': + main() diff --git a/tests/test_io/test_splice_graph_dict_to_gfa1.py b/tests/test_io/test_splice_graph_dict_to_gfa1.py deleted file mode 100644 index 1a96ca9..0000000 --- a/tests/test_io/test_splice_graph_dict_to_gfa1.py +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env python3 - -""" -Tests for exfi.io.gfa1_to_exons -""" - -from unittest import TestCase, main - -import filecmp -import tempfile -import os - -from exfi.io.splice_graph_dict_to_gfa1 import \ - _compute_segments, \ - _compute_links, \ - _compute_containments, \ - _compute_paths, \ - splice_graph_dict_to_gfa1 - -from tests.data import \ - TRANSCRIPTOME_EMPTY_DICT, TRANSCRIPTOME_SIMPLE_DICT, TRANSCRIPTOME_COMPLEX_DICT, \ - GFA_EMPTY_FN, GFA_SIMPLE_FN, GFA_COMPLEX_FN, \ - SPLICE_GRAPH_EMPTY_DICT, SPLICE_GRAPH_SIMPLE_DICT, SPLICE_GRAPH_COMPLEX_DICT, \ - SEGMENTS_EMPTY, SEGMENTS_SIMPLE, SEGMENTS_COMPLEX, \ - LINKS_EMPTY, LINKS_SIMPLE, LINKS_COMPLEX, \ - CONTAINMENTS_EMPTY, CONTAINMENTS_SIMPLE, CONTAINMENTS_COMPLEX, \ - PATHS_EMPTY, PATHS_SIMPLE, PATHS_COMPLEX - - - -class TestComputeSegments(TestCase): - """Tests for _compute_segments""" - - def test_empty(self): - """exfi.io.splice_graph_dict_to_gfa1._compute_segments: empty case""" - actual = list(_compute_segments(SPLICE_GRAPH_EMPTY_DICT, TRANSCRIPTOME_EMPTY_DICT)) - expected = SEGMENTS_EMPTY - self.assertEqual(actual, expected) - - def test_simple(self): - """exfi.io.splice_graph_dict_to_gfa1._compute_segments: simple case""" - actual = list(_compute_segments(SPLICE_GRAPH_SIMPLE_DICT, TRANSCRIPTOME_SIMPLE_DICT)) - expected = SEGMENTS_SIMPLE - self.assertEqual(actual, expected) - - def test_complex(self): - """exfi.io.splice_graph_dict_to_gfa1._compute_segments: complex case""" - actual = list(_compute_segments(SPLICE_GRAPH_COMPLEX_DICT, TRANSCRIPTOME_COMPLEX_DICT)) - expected = SEGMENTS_COMPLEX - self.assertEqual(actual, expected) - - - -class TestComputeLinks(TestCase): - """Tests for _compute_links""" - - def test_empty(self): - """exfi.io.splice_graph_dict_to_gfa1._compute_links: empty case""" - actual = list(_compute_links(SPLICE_GRAPH_EMPTY_DICT)) - expected = LINKS_EMPTY - self.assertEqual(actual, expected) - - def test_simple(self): - """exfi.io.splice_graph_dict_to_gfa1._compute_links: simple case""" - actual = list(_compute_links(SPLICE_GRAPH_SIMPLE_DICT)) - expected = LINKS_SIMPLE - self.assertEqual(actual, expected) - - def test_coplex(self): - """exfi.io.splice_graph_dict_to_gfa1._compute_links: complex case""" - actual = list(_compute_links(SPLICE_GRAPH_COMPLEX_DICT)) - expected = LINKS_COMPLEX - self.assertEqual(actual, expected) - - - -class TestComputeContainments(TestCase): - """Tests for _compute_containments""" - - def test_empty(self): - """exfi.io.splice_graph_dict_to_gfa1._compute_containments: empty case""" - actual = list(_compute_containments(SPLICE_GRAPH_EMPTY_DICT)) - expected = CONTAINMENTS_EMPTY - self.assertEqual(actual, expected) - - def test_simple(self): - """exfi.io.splice_graph_dict_to_gfa1._compute_containments: simple case""" - actual = list(_compute_containments(SPLICE_GRAPH_SIMPLE_DICT)) - expected = CONTAINMENTS_SIMPLE - self.assertEqual(actual, expected) - - def test_coplex(self): - """exfi.io.splice_graph_dict_to_gfa1._compute_containments: complex case""" - actual = list(_compute_containments(SPLICE_GRAPH_COMPLEX_DICT)) - expected = CONTAINMENTS_COMPLEX - self.assertEqual(actual, expected) - - - -class TestComputePaths(TestCase): - """Tests for exfi.io.splice_graph_dict_to_gfa1._compute_paths""" - - def test_empty(self): - """exfi.io.splice_graph_dict_to_gfa1._compute_paths: empty case""" - actual = list(_compute_paths(SPLICE_GRAPH_EMPTY_DICT)) - expected = PATHS_EMPTY - self.assertEqual(actual, expected) - - def test_simple(self): - """exfi.io.splice_graph_dict_to_gfa1._compute_paths: simple case""" - actual = list(_compute_paths(SPLICE_GRAPH_SIMPLE_DICT)) - expected = PATHS_SIMPLE - self.assertEqual(actual, expected) - - def test_complex(self): - """exfi.io.splice_graph_dict_to_gfa1._compute_paths: complex case""" - actual = list(_compute_paths(SPLICE_GRAPH_COMPLEX_DICT)) - expected = PATHS_COMPLEX - self.assertEqual(actual, expected) - - -class TestSpliceGraphToGFA1(TestCase): - """Tests for exfi.io.splice_graph_dict_to_gfa1.splice_graph_dict_to_gfa1""" - - def test_empty(self): - """exfi.io.splice_graph_dict_to_gfa1.splice_graph_dict_to_gfa1: empty case""" - tmp_file = tempfile.mkstemp()[1] - splice_graph_dict_to_gfa1( - splice_graph_dict=SPLICE_GRAPH_EMPTY_DICT, - transcriptome_dict=TRANSCRIPTOME_EMPTY_DICT, - filename=tmp_file - ) - self.assertTrue(filecmp.cmp( - tmp_file, - GFA_EMPTY_FN - )) - os.remove(tmp_file) - - def test_simple(self): - """exfi.io.splice_graph_dict_to_gfa1.splice_graph_dict_to_gfa1: simple case""" - tmp_file = tempfile.mkstemp()[1] - splice_graph_dict_to_gfa1( - splice_graph_dict=SPLICE_GRAPH_SIMPLE_DICT, - transcriptome_dict=TRANSCRIPTOME_SIMPLE_DICT, - filename=tmp_file #tmp_file - ) - self.assertTrue(filecmp.cmp( - tmp_file, - GFA_SIMPLE_FN - )) - os.remove(tmp_file) - - def test_multiple(self): - """exfi.io.splice_graph_dict_to_gfa1.splice_graph_dict_to_gfa1: complex case""" - tmp_file = tempfile.mkstemp()[1] - splice_graph_dict_to_gfa1( - splice_graph_dict=SPLICE_GRAPH_COMPLEX_DICT, - transcriptome_dict=TRANSCRIPTOME_COMPLEX_DICT, - filename=tmp_file - ) - self.assertTrue(filecmp.cmp( - tmp_file, - GFA_COMPLEX_FN - )) - os.remove(tmp_file) - -if __name__ == "__main__": - main() From 1340ff5bdd604c491460912a8f5cf6fbd107e0ff Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Tue, 15 Jan 2019 17:24:39 +0100 Subject: [PATCH 23/45] Tests for exfi.io.gfa1_to_bed.py and bugfixes --- exfi/io/gfa1_to_bed.py | 5 +++- tests/io/bed.py | 15 ++++++++++- tests/io/gfa1.py | 2 ++ tests/test_io/test_bed.py | 9 +++++++ tests/test_io/test_gfa1_to_bed.py | 42 +++++++++++++++++++++++++++++++ 5 files changed, 71 insertions(+), 2 deletions(-) create mode 100644 tests/test_io/test_gfa1_to_bed.py diff --git a/exfi/io/gfa1_to_bed.py b/exfi/io/gfa1_to_bed.py index 902037d..658c539 100644 --- a/exfi/io/gfa1_to_bed.py +++ b/exfi/io/gfa1_to_bed.py @@ -26,8 +26,11 @@ def gfa1_to_bed4(filename): "Contained": "name" }, axis=1) containments["Overlap"] = containments["Overlap"]\ - .map(lambda x: int(x[:-1])) + .map(lambda x: np.int(x[:-1])) containments["chromStart"] = containments["Pos"] containments["chromEnd"] = containments["Pos"] + containments["Overlap"] containments = containments[["chrom", "chromStart", "chromEnd", "name"]] + containments = containments.astype( + {"chromStart": np.int64, "chromEnd": np.int64} + ) return containments diff --git a/tests/io/bed.py b/tests/io/bed.py index 7e88d1f..c3f6445 100644 --- a/tests/io/bed.py +++ b/tests/io/bed.py @@ -9,6 +9,9 @@ data=None, columns=["chrom", "chromStart", "chromEnd"] ) +BED3_EMPTY = BED3_EMPTY.astype( + {"chrom": str, "chromStart": np.int64, "chromEnd": np.int64} +) BED3_SIMPLE = pd.DataFrame( data=[("ENSDART00000161035.1", 0, 326)], @@ -39,9 +42,14 @@ BED4_EMPTY = pd.DataFrame( - data=None, columns=["chrom", "chromStart", "chromEnd", "name"] + data=None, + columns=["chrom", "chromStart", "chromEnd", "name"] +) +BED4_EMPTY = BED4_EMPTY.astype( + {"chrom": str, "chromStart": np.int64, "chromEnd": np.int64, "name": str} ) + BED4_SIMPLE = pd.DataFrame( data=[("ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326")], columns=["chrom", "chromStart", "chromEnd", "name"] @@ -75,6 +83,10 @@ data=None, columns=["chrom", "chromStart", "chromEnd", "name"] ).set_index("name") +NODE2COORDINATES_EMPTY = NODE2COORDINATES_EMPTY.astype( + {"chromStart": np.int64, "chromEnd": np.int64} +) + NODE2COORDINATES_SIMPLE = pd.DataFrame( data=[("ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326")], @@ -214,6 +226,7 @@ EDGE2OVERLAP_EMPTY = pd.DataFrame(columns=["u", "v", "overlap"]) +EDGE2OVERLAP_EMPTY = EDGE2OVERLAP_EMPTY.astype({"overlap": np.int64}) EDGE2OVERLAP_SIMPLE = pd.DataFrame(columns=["u", "v", "overlap"]) EDGE2OVERLAP_SIMPLE = EDGE2OVERLAP_SIMPLE.astype({"overlap": np.int64}) diff --git a/tests/io/gfa1.py b/tests/io/gfa1.py index 1128d5b..f8139c9 100644 --- a/tests/io/gfa1.py +++ b/tests/io/gfa1.py @@ -3,6 +3,7 @@ """tests.io.gfa1.py: Fragments of GFA1 files""" import pandas as pd +import numpy as np HEADER = pd.DataFrame( data=[["H", "VN:Z:1.0"]], @@ -180,6 +181,7 @@ columns=["RecordType", "Container", "ContainerOrient", "Contained", "ContainedOrient", "Pos", "Overlap"] ) +CONTAINMENTS_EMPTY = CONTAINMENTS_EMPTY.astype({"Overlap": np.int64}) CONTAINMENTS_SIMPLE = pd.DataFrame( data=[[ diff --git a/tests/test_io/test_bed.py b/tests/test_io/test_bed.py index 0d053b9..db6768f 100644 --- a/tests/test_io/test_bed.py +++ b/tests/test_io/test_bed.py @@ -23,6 +23,8 @@ TRANSCRIPTOME_EMPTY_DICT, TRANSCRIPTOME_SIMPLE_DICT, \ TRANSCRIPTOME_COMPLEX_DICT + + class TestBed3ToBed4(unittest.TestCase): """Tests for exfi.io.bed.bed3_to_bed4""" @@ -42,6 +44,7 @@ def test_complex(self): self.assertTrue(observed.equals(BED4_COMPLEX)) + class TestBed4ToNode2Coordinates(unittest.TestCase): """Tests for exfi.io.bed.bed4_to_node2coordinates""" def test_empty(self): @@ -60,6 +63,7 @@ def test_complex(self): self.assertTrue(observed.equals(NODE2COORDINATES_COMPLEX)) + class TestBed4ToPath2Nodes(unittest.TestCase): """Tests for exfi.io.bed.bed4_to_path2nodes""" def test_empty(self): @@ -80,6 +84,7 @@ def test_complex(self): self.assertEqual(observed, PATH2NODES_COMPLEX) + class TestBed4ToNode2Sequence(unittest.TestCase): """Tests for exfi.io.bed.bed4_to_node2sequence""" @@ -101,12 +106,15 @@ def test_complex(self): self.assertTrue(observed.equals(NODE2SEQUENCE_COMPLEX)) + class TestBed4ToEdge2Overlap(unittest.TestCase): """Tests for exfi.io.bed.bed4_to_edge2overlap""" def test_empty(self): """exfi.io.bed.bed4_to_edge2overlap: empty case""" observed = bed4_to_edge2overlap(BED4_EMPTY) + print("Observed:\n", observed, observed.dtypes) + print("Expected:\n", EDGE2OVERLAP_EMPTY, EDGE2OVERLAP_EMPTY.dtypes) self.assertTrue(observed.equals(EDGE2OVERLAP_EMPTY)) def test_simple(self): @@ -124,5 +132,6 @@ def test_complex(self): self.assertTrue(observed.equals(EDGE2OVERLAP_COMPLEX)) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_io/test_gfa1_to_bed.py b/tests/test_io/test_gfa1_to_bed.py new file mode 100644 index 0000000..44e36cd --- /dev/null +++ b/tests/test_io/test_gfa1_to_bed.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 + +"""tests.test_io.test_gfa1_to_bed.py: tests for exfi.io.gfa1_to_bed.py""" + + +from unittest import TestCase, main + +from exfi.io.gfa1_to_bed import \ + gfa1_to_bed4 + +from tests.io.gfa1 import \ + GFA1_EMPTY_FN, GFA1_SIMPLE_FN, GFA1_COMPLEX_FN + +from tests.io.bed import \ + BED4_EMPTY, BED4_SIMPLE, BED4_COMPLEX + + + +class TestGFA1ToBED4(TestCase): + """Tests for exfi.io.gfa1_to_bed.gfa1_to_bed4""" + + def test_empty(self): + '''exfi.io.gfa1_to_bed.gfa1_to_bed4: empty case''' + observed = gfa1_to_bed4(GFA1_EMPTY_FN) + print("Observed", observed, observed.dtypes, sep="\n") + print("Expected", BED4_EMPTY, BED4_EMPTY.dtypes, sep="\n") + self.assertTrue(observed.equals(BED4_EMPTY)) + + def test_simple(self): + '''exfi.io.gfa1_to_bed.gfa1_to_bed4: simple case''' + observed = gfa1_to_bed4(GFA1_SIMPLE_FN) + self.assertTrue(observed.equals(BED4_SIMPLE)) + + def test_complex(self): + '''exfi.io.gfa1_to_bed.gfa1_to_bed4: complex case''' + observed = gfa1_to_bed4(GFA1_COMPLEX_FN) + self.assertTrue(observed.equals(BED4_COMPLEX)) + + + +if __name__ == '__main__': + main() From 2458ae37b50034950a260e46b85c663d3a8c3d9f Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Tue, 15 Jan 2019 18:07:22 +0100 Subject: [PATCH 24/45] Tests for exfi.io.gfa1_to_fasta.py --- exfi/io/gfa1_to_fasta.py | 11 +++- tests/io/exons_complex.fa | 43 ++++--------- tests/io/exons_complex_hard.fa | 43 ++++--------- tests/io/exons_complex_soft.fa | 43 ++++--------- tests/io/exons_simple.fa | 7 +-- tests/io/fasta.py | 15 +++++ tests/io/gapped_complex.fa | 53 +--------------- tests/io/gapped_complex_soft.fa | 53 +--------------- tests/io/gapped_simple.fa | 7 +-- tests/test_io/test_bed4_to_gfa1.py | 2 + tests/test_io/test_gfa1_to_fasta.py | 98 +++++++++++++++++++++++++++++ 11 files changed, 163 insertions(+), 212 deletions(-) create mode 100644 tests/io/fasta.py create mode 100644 tests/test_io/test_gfa1_to_fasta.py diff --git a/exfi/io/gfa1_to_fasta.py b/exfi/io/gfa1_to_fasta.py index fe2e44b..dccbee9 100644 --- a/exfi/io/gfa1_to_fasta.py +++ b/exfi/io/gfa1_to_fasta.py @@ -15,9 +15,13 @@ def gfa1_to_exons(fasta_out, gfa1_in): ], columns=["RecordType", "Name", "Sequence"], ) + + if segments.shape[0] == 0: + return + segments["fasta"] = ">" + segments["Name"] + "\n" + segments["Sequence"] segments.fasta.values.tofile(fasta, sep="\n", format="%s") - fasta.write("\n") # End line + fasta.write("\n") # Final end line def gfa1_to_gapped_transcripts(fasta_out, gfa1_in, gap_size=100): @@ -33,6 +37,9 @@ def gfa1_to_gapped_transcripts(fasta_out, gfa1_in, gap_size=100): for x in gfa.readlines() if x[0] in set(["S", "P"]) ] + if not data: + return + # Create {node_id: nucleotide} node2sequence = pd.DataFrame( data=[x[0:3] for x in data if x[0] == "S"], @@ -66,4 +73,4 @@ def gfa1_to_gapped_transcripts(fasta_out, gfa1_in, gap_size=100): # Dump everything paths.fasta.values.tofile(fasta, sep="\n", format="%s") - fasta.write("\n") # End line + fasta.write("\n") # Final end line diff --git a/tests/io/exons_complex.fa b/tests/io/exons_complex.fa index 812294a..706a2eb 100644 --- a/tests/io/exons_complex.fa +++ b/tests/io/exons_complex.fa @@ -1,51 +1,30 @@ >ENSDART00000161035.1:0-326 -TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTA -CATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACA -GCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCT -CTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCG -TCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGG -ATGTGGAAATTCCTCCGCCACGAGCA +TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA >ENSDART00000161035.1:397-472 -AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGT -CAGTCCAAATCAACA +AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACA >ENSDART00000161035.1:477-523 AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA >ENSDART00000165342.1:5-127 -TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAA -GGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGA -AG +TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAAG >ENSDART00000165342.1:125-304 -AGGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGG -CAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACC -CCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA +AGGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA >ENSDART00000165342.1:317-460 -GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGA -GGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGG -ACAAGGAGACAAGAAGTTACCAG +GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAG >ENSDART00000165342.1:459-592 -GTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGC -TCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTT -CTGGGGTTCTTAA +GTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAA >ENSDART00000165342.1:591-650 AGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTGCCCT >ENSDART00000165342.1:645-746 -GCCCTTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACC -ATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT +GCCCTTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT >ENSDART00000165342.1:746-851 -GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGC -CTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA +GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA >ENSDART00000165342.1:854-886 TGCAGCCAAACAATGCAACTGTGACAGCAGCA >ENSDART00000165342.1:899-953 TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA >ENSDART00000165342.1:974-1097 -CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGA -GTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGC -TGA +CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA >ENSDART00000165342.1:1098-1175 -TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTG -ACGCTCACATATCCTGA +TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGA >ENSDART00000165342.1:1176-1324 -CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGAC -CCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCT -GCCCTGCAACTGGCAAAACACCACAGCG +CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG diff --git a/tests/io/exons_complex_hard.fa b/tests/io/exons_complex_hard.fa index 7e3daba..2e50393 100644 --- a/tests/io/exons_complex_hard.fa +++ b/tests/io/exons_complex_hard.fa @@ -1,51 +1,30 @@ >ENSDART00000161035.1:0-326 -TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTA -CATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACA -GCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCT -CTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCG -TCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGG -ATGTGGAAATTCCTCCGCCACGAGCA +TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA >ENSDART00000161035.1:397-472 -AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGT -CAGTCCAAATCAACA +AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACA >ENSDART00000161035.1:477-523 AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA >ENSDART00000165342.1:5-127 -TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAA -GGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGA -NN +TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGANN >ENSDART00000165342.1:125-304 -NNGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGG -CAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACC -CCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA +NNGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA >ENSDART00000165342.1:317-460 -GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGA -GGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGG -ACAAGGAGACAAGAAGTTACCAN +GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAN >ENSDART00000165342.1:459-592 -NTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGC -TCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTT -CTGGGGTTCTTAN +NTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAN >ENSDART00000165342.1:591-650 NGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTNNNNN >ENSDART00000165342.1:645-746 -NNNNNTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACC -ATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT +NNNNNTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT >ENSDART00000165342.1:746-851 -GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGC -CTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA +GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA >ENSDART00000165342.1:854-886 TGCAGCCAAACAATGCAACTGTGACAGCAGCA >ENSDART00000165342.1:899-953 TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA >ENSDART00000165342.1:974-1097 -CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGA -GTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGC -TGA +CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA >ENSDART00000165342.1:1098-1175 -TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTG -ACGCTCACATATCCTGA +TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGA >ENSDART00000165342.1:1176-1324 -CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGAC -CCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCT -GCCCTGCAACTGGCAAAACACCACAGCG +CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG diff --git a/tests/io/exons_complex_soft.fa b/tests/io/exons_complex_soft.fa index 634bb51..79b5c10 100644 --- a/tests/io/exons_complex_soft.fa +++ b/tests/io/exons_complex_soft.fa @@ -1,51 +1,30 @@ >ENSDART00000161035.1:0-326 -TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTA -CATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACA -GCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCT -CTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCG -TCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGG -ATGTGGAAATTCCTCCGCCACGAGCA +TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA >ENSDART00000161035.1:397-472 -AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGT -CAGTCCAAATCAACA +AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACA >ENSDART00000161035.1:477-523 AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA >ENSDART00000165342.1:5-127 -TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAA -GGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGA -ag +TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAag >ENSDART00000165342.1:125-304 -agGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGG -CAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACC -CCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA +agGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA >ENSDART00000165342.1:317-460 -GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGA -GGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGG -ACAAGGAGACAAGAAGTTACCAg +GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAg >ENSDART00000165342.1:459-592 -gTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGC -TCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTT -CTGGGGTTCTTAa +gTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAa >ENSDART00000165342.1:591-650 aGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTgccct >ENSDART00000165342.1:645-746 -gccctTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACC -ATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT +gccctTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT >ENSDART00000165342.1:746-851 -GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGC -CTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA +GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA >ENSDART00000165342.1:854-886 TGCAGCCAAACAATGCAACTGTGACAGCAGCA >ENSDART00000165342.1:899-953 TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA >ENSDART00000165342.1:974-1097 -CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGA -GTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGC -TGA +CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA >ENSDART00000165342.1:1098-1175 -TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTG -ACGCTCACATATCCTGA +TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGA >ENSDART00000165342.1:1176-1324 -CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGAC -CCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCT -GCCCTGCAACTGGCAAAACACCACAGCG +CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG diff --git a/tests/io/exons_simple.fa b/tests/io/exons_simple.fa index a2665a8..00a4fb2 100644 --- a/tests/io/exons_simple.fa +++ b/tests/io/exons_simple.fa @@ -1,7 +1,2 @@ >ENSDART00000161035.1:0-326 -TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTA -CATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACA -GCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCT -CTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCG -TCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGG -ATGTGGAAATTCCTCCGCCACGAGCA +TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA diff --git a/tests/io/fasta.py b/tests/io/fasta.py new file mode 100644 index 0000000..6b668d1 --- /dev/null +++ b/tests/io/fasta.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 + +"""tests.io.fasta.py: variables for fasta files""" + +EXONS_EMPTY_FN = "tests/io/exons_empty.fa" +EXONS_SIMPLE_FN = "tests/io/exons_simple.fa" +EXONS_COMPLEX_FN = "tests/io/exons_complex.fa" +EXONS_COMPLEX_SOFT_FN = "tests/io/exons_complex_soft.fa" +EXONS_COMPLEX_HARD_FN = "tests/io/exons_complex_hard.fa" + +GAPPED_EMPTY_FN = "tests/io/gapped_empty.fa" +GAPPED_SIMPLE_FN = "tests/io/gapped_simple.fa" +GAPPED_COMPLEX_FN = "tests/io/gapped_complex.fa" +GAPPED_COMPLEX_SOFT_FN = "tests/io/gapped_complex_soft.fa" +GAPPED_COMPLEX_HARD_FN = "tests/io/gapped_complex_hard.fa" diff --git a/tests/io/gapped_complex.fa b/tests/io/gapped_complex.fa index ce2bc82..f62db30 100644 --- a/tests/io/gapped_complex.fa +++ b/tests/io/gapped_complex.fa @@ -1,53 +1,4 @@ >ENSDART00000161035.1 ENSDART00000161035.1:0-326,ENSDART00000161035.1:397-472,ENSDART00000161035.1:477-523 -TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTA -CATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACA -GCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCT -CTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCG -TCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGG -ATGTGGAAATTCCTCCGCCACGAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNAGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCT -GCCAGTCAGTCCAAATCAACANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NAGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA +TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA >ENSDART00000165342.1 ENSDART00000165342.1:5-127,ENSDART00000165342.1:125-304,ENSDART00000165342.1:317-460,ENSDART00000165342.1:459-592,ENSDART00000165342.1:591-650,ENSDART00000165342.1:645-746,ENSDART00000165342.1:746-851,ENSDART00000165342.1:854-886,ENSDART00000165342.1:899-953,ENSDART00000165342.1:974-1097,ENSDART00000165342.1:1098-1175,ENSDART00000165342.1:1176-1324 -TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAA -GGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGA -AGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGACCTGTAGTAGAAAC -AAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAA -TAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACC -ACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCANNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNGTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGA -GTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACAT -CTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAGNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNGTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTT -GGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTG -CTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAANNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNAGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGG -ATCAAGTAGCTGCCCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCCC -TTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCT -TTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCTNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNGATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATT -GCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCC -CANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGCAGCCAAACAATGCAA -CTGTGACAGCAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTCGA -CTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAANNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNCGTTGCTGTAGATTCTTATTTCCTTCCCAAAC -CCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTA -ACAATGATGAGTTTGGCTTCTTGTTGGCTGANNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNTATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCG -CTGAAGCCTTGACGCTCACATATCCTGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNCCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACA -CACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTA -ACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG +TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTGCCCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCCCTTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGCAGCCAAACAATGCAACTGTGACAGCAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG diff --git a/tests/io/gapped_complex_soft.fa b/tests/io/gapped_complex_soft.fa index 9058743..c636bab 100644 --- a/tests/io/gapped_complex_soft.fa +++ b/tests/io/gapped_complex_soft.fa @@ -1,53 +1,4 @@ >ENSDART00000161035.1 ENSDART00000161035.1:0-326,ENSDART00000161035.1:397-472,ENSDART00000161035.1:477-523 -TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTA -CATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACA -GCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCT -CTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCG -TCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGG -ATGTGGAAATTCCTCCGCCACGAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNAGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCT -GCCAGTCAGTCCAAATCAACANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NAGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA +TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA >ENSDART00000165342.1 ENSDART00000165342.1:5-127,ENSDART00000165342.1:125-304,ENSDART00000165342.1:317-460,ENSDART00000165342.1:459-592,ENSDART00000165342.1:591-650,ENSDART00000165342.1:645-746,ENSDART00000165342.1:746-851,ENSDART00000165342.1:854-886,ENSDART00000165342.1:899-953,ENSDART00000165342.1:974-1097,ENSDART00000165342.1:1098-1175,ENSDART00000165342.1:1176-1324 -TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAA -GGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGA -agNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNagGACCTGTAGTAGAAAC -AAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAA -TAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACC -ACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCANNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNGTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGA -GTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACAT -CTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAgNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNgTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTT -GGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTG -CTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAaNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNaGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGG -ATCAAGTAGCTgccctNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNgccc -tTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCT -TTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCTNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNGATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATT -GCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCC -CANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGCAGCCAAACAATGCAA -CTGTGACAGCAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTCGA -CTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAANNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNCGTTGCTGTAGATTCTTATTTCCTTCCCAAAC -CCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTA -ACAATGATGAGTTTGGCTTCTTGTTGGCTGANNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNTATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCG -CTGAAGCCTTGACGCTCACATATCCTGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNCCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACA -CACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTA -ACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG +TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAagNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNagGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAgNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNgTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAaNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNaGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTgccctNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNgccctTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGCAGCCAAACAATGCAACTGTGACAGCAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG diff --git a/tests/io/gapped_simple.fa b/tests/io/gapped_simple.fa index 8aed86a..2f02d57 100644 --- a/tests/io/gapped_simple.fa +++ b/tests/io/gapped_simple.fa @@ -1,7 +1,2 @@ >ENSDART00000161035.1 ENSDART00000161035.1:0-326 -TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTA -CATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACA -GCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCT -CTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCG -TCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGG -ATGTGGAAATTCCTCCGCCACGAGCA +TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA diff --git a/tests/test_io/test_bed4_to_gfa1.py b/tests/test_io/test_bed4_to_gfa1.py index ecc193d..0e0359c 100644 --- a/tests/test_io/test_bed4_to_gfa1.py +++ b/tests/test_io/test_bed4_to_gfa1.py @@ -173,5 +173,7 @@ def test_complex(self): self.assertTrue(filecmp.cmp(tmp_file, GFA1_COMPLEX_FN)) os.remove(tmp_file) + + if __name__ == '__main__': main() diff --git a/tests/test_io/test_gfa1_to_fasta.py b/tests/test_io/test_gfa1_to_fasta.py new file mode 100644 index 0000000..6f06459 --- /dev/null +++ b/tests/test_io/test_gfa1_to_fasta.py @@ -0,0 +1,98 @@ +#!/usr/bin/python3 + +"""tests.test_io.test_gfa1_to_fasta.py: tests for exfi.io.gfa1_to_fasta.py""" + +from unittest import TestCase, main + +from tempfile import mkstemp +import os +import filecmp + +from exfi.io.gfa1_to_fasta import \ + gfa1_to_exons, gfa1_to_gapped_transcripts + + +from tests.io.gfa1 import \ + GFA1_EMPTY_FN, GFA1_SIMPLE_FN, GFA1_COMPLEX_FN + +from tests.io.fasta import \ + EXONS_EMPTY_FN, EXONS_SIMPLE_FN, EXONS_COMPLEX_FN, \ + GAPPED_EMPTY_FN, GAPPED_SIMPLE_FN, GAPPED_COMPLEX_FN + + +class TestGFA1ToExons(TestCase): + """Tests for exfi.io.gfa1_to_fasta.gfa1_to_exons""" + + def test_empty(self): + """exfi.io.gfa1_to_fasta.gfa1_to_exons: empty case""" + tmp_file = mkstemp()[1] + print(tmp_file) + gfa1_to_exons(fasta_out=tmp_file, gfa1_in=GFA1_EMPTY_FN) + self.assertTrue(filecmp.cmp(tmp_file, EXONS_EMPTY_FN)) + os.remove(tmp_file) + + def test_simple(self): + """exfi.io.gfa1_to_fasta.gfa1_to_exons: simple case""" + tmp_file = mkstemp()[1] + print(tmp_file) + gfa1_to_exons(fasta_out=tmp_file, gfa1_in=GFA1_SIMPLE_FN) + self.assertTrue(filecmp.cmp(tmp_file, EXONS_SIMPLE_FN)) + os.remove(tmp_file) + + def test_complex(self): + """exfi.io.gfa1_to_fasta.gfa1_to_exons: complex case""" + tmp_file = mkstemp()[1] + print(tmp_file) + gfa1_to_exons(fasta_out=tmp_file, gfa1_in=GFA1_COMPLEX_FN) + self.assertTrue(filecmp.cmp(tmp_file, EXONS_COMPLEX_FN)) + os.remove(tmp_file) + + # def test_soft_masking(self): + # """exfi.io.gfa1_to_fasta.gfa1_to_exons: complex and soft masking case""" + # pass + # + # def test_complex(self): + # """exfi.io.gfa1_to_fasta.gfa1_to_exons: complex and hard masking case""" + # pass + + + +class TestGFA1ToGappedTranscripts(TestCase): + """Tests for exfi.io.gfa1_to_fasta.gfa1_to_gapped_transcripts""" + + def test_empty(self): + """exfi.io.gfa1_to_fasta.gfa1_to_gapped_transcripts: empty case""" + tmp_file = mkstemp()[1] + print(tmp_file) + gfa1_to_gapped_transcripts(fasta_out=tmp_file, gfa1_in=GFA1_EMPTY_FN) + self.assertTrue(filecmp.cmp(tmp_file, GAPPED_EMPTY_FN)) + os.remove(tmp_file) + + def test_simple(self): + """exfi.io.gfa1_to_fasta.gfa1_to_gapped_transcripts: simple case""" + tmp_file = mkstemp()[1] + print(tmp_file) + gfa1_to_gapped_transcripts(fasta_out=tmp_file, gfa1_in=GFA1_SIMPLE_FN) + self.assertTrue(filecmp.cmp(tmp_file, GAPPED_SIMPLE_FN)) + os.remove(tmp_file) + + def test_complex(self): + """exfi.io.gfa1_to_fasta.gfa1_to_gapped_transcripts: complex case""" + tmp_file = mkstemp()[1] + print(tmp_file) + gfa1_to_gapped_transcripts(fasta_out=tmp_file, gfa1_in=GFA1_COMPLEX_FN) + self.assertTrue(filecmp.cmp(tmp_file, GAPPED_COMPLEX_FN)) + os.remove(tmp_file) + + # def test_soft_masking(self): + # """exfi.io.gfa1_to_fasta.gfa1_to_gapped_transcripts: complex and soft masking case""" + # pass + # + # def test_complex(self): + # """exfi.io.gfa1_to_fasta.gfa1_to_gapped_transcripts: complex and hard masking case""" + # pass + + + +if __name__ == '__main__': + main() From 696f8c0e0c75e04eb95dba0a9ce2807f971b8c81 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Tue, 15 Jan 2019 18:41:17 +0100 Subject: [PATCH 25/45] Tests for exfi.io.read_bed3 --- exfi/io/{read_bed3.py => read_bed.py} | 12 +++++---- tests/io/bed.py | 5 ++++ tests/test_io/test_read_bed.py | 35 +++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 5 deletions(-) rename exfi/io/{read_bed3.py => read_bed.py} (58%) create mode 100644 tests/test_io/test_read_bed.py diff --git a/exfi/io/read_bed3.py b/exfi/io/read_bed.py similarity index 58% rename from exfi/io/read_bed3.py rename to exfi/io/read_bed.py index e60ee9b..9c7d097 100644 --- a/exfi/io/read_bed3.py +++ b/exfi/io/read_bed.py @@ -1,18 +1,20 @@ #!/usr/bin/env python3 -"""exfi.io.read_bed3.py: BED3 importer""" +"""exfi.io.read_bed.py: BED importer""" + +import pandas as pd +import numpy as np def read_bed3(filename): """Read a BED file and return the BED3 dataframe.""" - import pandas as pd - import numpy as np - bed3 = pd.read_table( + bed3 = pd.read_csv( filepath_or_buffer=filename, header=None, + sep='\t', usecols=[0, 1, 2], names=["chrom", "chromStart", "chromEnd"], - dtype={"chrom": np.str, "chromStart": np.int, "chromEnd": np.int}, + dtype={"chrom": np.str, "chromStart": np.int64, "chromEnd": np.int64}, engine='c' ) return bed3 diff --git a/tests/io/bed.py b/tests/io/bed.py index c3f6445..3cca097 100644 --- a/tests/io/bed.py +++ b/tests/io/bed.py @@ -5,6 +5,11 @@ import pandas as pd import numpy as np +BED3_EMPTY_FN = "tests/io/empty.bed" +BED3_SIMPLE_FN = "tests/io/simple.bed" +BED3_COMPLEX_FN = "tests/io/complex.bed" + + BED3_EMPTY = pd.DataFrame( data=None, columns=["chrom", "chromStart", "chromEnd"] diff --git a/tests/test_io/test_read_bed.py b/tests/test_io/test_read_bed.py new file mode 100644 index 0000000..260f859 --- /dev/null +++ b/tests/test_io/test_read_bed.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +"""tests.test_io.test_read_bed.py: tests for exfi.io.read_bed.py""" + +from unittest import TestCase, main + +from exfi.io.read_bed import read_bed3 + +from tests.io.bed import \ + BED3_EMPTY_FN, BED3_SIMPLE_FN, BED3_COMPLEX_FN, \ + BED3_EMPTY, BED3_SIMPLE, BED3_COMPLEX + + +class TestReadBed3(TestCase): + """Tests for exfi.io.read_bed.read_bed3""" + + def test_empty(self): + """exfi.io.read_bed.read_bed3: empty case""" + observed = read_bed3(filename=BED3_EMPTY_FN) + self.assertTrue(observed.equals(BED3_EMPTY)) + + def test_simple(self): + """exfi.io.read_bed.read_bed3: simple case""" + observed = read_bed3(filename=BED3_SIMPLE_FN) + self.assertTrue(observed.equals(BED3_SIMPLE)) + + def test_complex(self): + """exfi.io.read_bed.read_bed3: complex case""" + observed = read_bed3(filename=BED3_COMPLEX_FN) + self.assertTrue(observed.equals(BED3_COMPLEX)) + + + +if __name__ == '__main__': + main() From 36556ef231d1d77ff9d9c5166a98fc6ad6c45279 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Wed, 16 Jan 2019 15:00:45 +0100 Subject: [PATCH 26/45] exfi.io.gff3_to_bed.py + tests and files --- exfi/io/gff3_to_bed.py | 107 +++++++++++++++++++++++ tests/io/bed.py | 138 ++++++++++++++++++++++++++++++ tests/io/empty.gff3 | 0 tests/io/ensembl.gff3 | 100 ++++++++++++++++++++++ tests/io/gff3.py | 8 ++ tests/io/gmap.gff3 | 111 ++++++++++++++++++++++++ tests/test_io/test_gff3_to_bed.py | 47 ++++++++++ 7 files changed, 511 insertions(+) create mode 100644 exfi/io/gff3_to_bed.py create mode 100644 tests/io/empty.gff3 create mode 100644 tests/io/ensembl.gff3 create mode 100644 tests/io/gff3.py create mode 100644 tests/io/gmap.gff3 create mode 100644 tests/test_io/test_gff3_to_bed.py diff --git a/exfi/io/gff3_to_bed.py b/exfi/io/gff3_to_bed.py new file mode 100644 index 0000000..48faaa3 --- /dev/null +++ b/exfi/io/gff3_to_bed.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 + +"""exfi.io.gff3_to_bed.py: exfi submodule to convert a gff3 to bed3 where +coordinates are with respect to the transcriptome""" + +import sys + +import pandas as pd +import numpy as np + +def gff3_to_bed3(gff3_in, mode="ensembl"): + """Read a GFF3 file and convert it to BED3, where coordinates are with + respect to the transcriptome + + Modes available: + - "ensembl": for files downloaded from Ensembl, + - "gmap": for GFF3 files generated from GMAP, + - "ncbi": for GFF3 files downloaded from NCBI Genomes + """ + + gff3_columns = [ + "seqid", "source", "type", "start", "end", "score", "strand", "phase", + "attributes" + ] + + bed3_columns = ["chrom", "chromStart", "chromEnd"] + bed3_dtypes = { + "chrom": np.str, "chromStart": np.int64, "chromEnd": np.int64 + } + + raw = pd.read_csv( + sep='\t', + na_values=".", + usecols=["type", "start", "end", "strand", "attributes"], + filepath_or_buffer=gff3_in, + comment="#", + header=None, + names=gff3_columns, + low_memory=False # Convert types at the end. Seqid is char, not int + ) + + if raw.shape[0] == 0: + exons = pd.DataFrame(columns=bed3_columns) + exons = exons.astype(bed3_dtypes) + return exons + + if mode == "gmap": + exons = raw[raw['type'] == 'cDNA_match'].drop(columns='type') + exons['transcript_id'] = exons['attributes']\ + .str.split(";").str[1]\ + .str.extract(r'Name=([\w\d.-_]+)') + elif mode == "ensembl": + exons = raw[raw['type'] == 'exon'].drop(columns='type') + exons["transcript_id"] = exons["attributes"]\ + .str.split(";", 1, ).str[0]\ + .str.extract(r'Parent=transcript:([\w\d.-_]+)') + else: + sys.exit("Unknown mode") + + + if exons.shape[0] == 0: + return exons + + exons = exons[['transcript_id', 'strand', 'start', 'end']] + + positive = ( + exons + [exons['strand'] == '+'] + .drop(columns='strand') + .sort_values(by=['transcript_id', 'start', 'end']) + ) + + negative = ( + exons + [exons['strand'] == '-'] + .drop(columns='strand') + .sort_values( + by=['transcript_id', 'start', 'end'], + ascending=[True, False, False] + ) + ) + + merged = pd.concat([positive, negative]) + + merged['length'] = merged['end'] - merged['start'] + 1 + merged['transcript_end'] = ( + merged + .groupby('transcript_id') + ['transcript_id', 'length'] + .cumsum() + ) + + merged['transcript_start'] = merged['transcript_end'] - merged['length'] + + merged = merged[['transcript_id', 'transcript_start', 'transcript_end']] + + merged = merged.rename(columns={ + 'transcript_id': 'chrom', + 'transcript_start': 'chromStart', + 'transcript_end': 'chromEnd' + }) + + merged = merged.astype(bed3_dtypes) + + merged = merged.reset_index(drop=True) + + return merged diff --git a/tests/io/bed.py b/tests/io/bed.py index 3cca097..0cdeb56 100644 --- a/tests/io/bed.py +++ b/tests/io/bed.py @@ -255,3 +255,141 @@ columns=["u", "v", "overlap"] ) EDGE2OVERLAP_COMPLEX = EDGE2OVERLAP_COMPLEX.astype({"overlap": np.int64}) + + + +BED3_ENSEMBL = pd.DataFrame( + data=[ + ['ENSDART00000161842', 0, 43], + ['ENSDART00000161842', 43, 192], + ['ENSDART00000161842', 192, 351], + ['ENSDART00000161842', 351, 417], + ['ENSDART00000161842', 417, 531], + ['ENSDART00000161842', 531, 695], + ['ENSDART00000161842', 695, 722], + ['ENSDART00000165461', 0, 236], + ['ENSDART00000165461', 236, 347], + ['ENSDART00000166393', 0, 213], + ['ENSDART00000166393', 213, 362], + ['ENSDART00000166393', 362, 521], + ['ENSDART00000166393', 521, 623], + ['ENSDART00000166393', 623, 737], + ['ENSDART00000166393', 737, 901], + ['ENSDART00000166393', 901, 1060], + ['ENSDART00000166393', 1060, 1173], + ['ENSDART00000166393', 1173, 1612], + ['ENSDART00000170165', 0, 68], + ['ENSDART00000170165', 68, 217], + ['ENSDART00000170165', 217, 398], + ['ENSDART00000170165', 398, 595], + ['ENSDART00000170165', 595, 759], + ['ENSDART00000170165', 759, 918], + ['ENSDART00000170165', 918, 1031], + ['ENSDART00000170165', 1031, 1470], + ['ENSDART00000170877', 0, 141], + ['ENSDART00000170877', 141, 290], + ['ENSDART00000170877', 290, 449], + ['ENSDART00000170877', 449, 551], + ['ENSDART00000170877', 551, 590], + ['ENSDART00000171631', 0, 90], + ['ENSDART00000157701', 0, 362], + ['ENSDART00000157701', 362, 426], + ['ENSDART00000157701', 426, 545], + ['ENSDART00000158290', 0, 444], + ['ENSDART00000158290', 444, 563], + ['ENSDART00000164359', 0, 277], + ['ENSDART00000164359', 277, 353], + ['ENSDART00000164359', 353, 464], + ['ENSDART00000164359', 464, 601], + ['ENSDART00000164359', 601, 665], + ['ENSDART00000164359', 665, 1018], + ['ENSDART00000167898', 0, 176], + ['ENSDART00000167898', 176, 287], + ['ENSDART00000167898', 287, 424], + ['ENSDART00000167898', 424, 488], + ['ENSDART00000167898', 488, 605] + ], + columns=["chrom", "chromStart", "chromEnd"] +) +BED3_NCBI = pd.DataFrame(columns=["chrom", "chromStart", "chromEnd"]) +BED3_GMAP = pd.DataFrame( + data=[ + ['ENSDART00000171570', 0, 61], + ['ENSDART00000171570', 61, 284], + ['ENSDART00000171570', 284, 462], + ['ENSDART00000171570', 462, 571], + ['ENSDART00000171570', 571, 673], + ['ENSDART00000171570', 673, 766], + ['ENSDART00000171570', 766, 934], + ['ENSDART00000171570', 934, 1024], + ['ENSDART00000171570', 1024, 1133], + ['ENSDART00000171570', 1133, 1289], + ['ENSDART00000171570', 1289, 1371], + ['ENSDART00000157830', 0, 90], + ['ENSDART00000157830', 90, 431], + ['ENSDART00000158772', 0, 49], + ['ENSDART00000158772', 49, 348], + ['ENSDART00000158814', 0, 64], + ['ENSDART00000158814', 64, 396], + ['ENSDART00000159795', 0, 46], + ['ENSDART00000159795', 46, 419], + ['ENSDART00000160202', 0, 30], + ['ENSDART00000160202', 30, 378], + ['ENSDART00000160762', 0, 43], + ['ENSDART00000160762', 43, 345], + ['ENSDART00000160996', 0, 40], + ['ENSDART00000160996', 40, 369], + ['ENSDART00000161368', 0, 43], + ['ENSDART00000161368', 43, 354], + ['ENSDART00000161463', 0, 43], + ['ENSDART00000161463', 43, 417], + ['ENSDART00000162456', 0, 43], + ['ENSDART00000162456', 43, 366], + ['ENSDART00000163675', 0, 43], + ['ENSDART00000163675', 43, 339], + ['ENSDART00000163851', 0, 40], + ['ENSDART00000163851', 40, 330], + ['ENSDART00000164110', 0, 67], + ['ENSDART00000164110', 67, 399], + ['ENSDART00000164309', 0, 46], + ['ENSDART00000164309', 46, 390], + ['ENSDART00000164489', 0, 40], + ['ENSDART00000164489', 40, 350], + ['ENSDART00000164491', 0, 43], + ['ENSDART00000164491', 43, 366], + ['ENSDART00000165410', 0, 52], + ['ENSDART00000165410', 52, 350], + ['ENSDART00000166029', 0, 61], + ['ENSDART00000166029', 61, 381], + ['ENSDART00000166882', 0, 43], + ['ENSDART00000166882', 43, 420], + ['ENSDART00000166892', 0, 332], + ['ENSDART00000167404', 0, 40], + ['ENSDART00000167404', 40, 357], + ['ENSDART00000167409', 0, 43], + ['ENSDART00000167409', 43, 420], + ['ENSDART00000167805', 0, 40], + ['ENSDART00000167805', 40, 378], + ['ENSDART00000168039', 0, 90], + ['ENSDART00000168039', 90, 470], + ['ENSDART00000170399', 0, 70], + ['ENSDART00000170399', 70, 380], + ['ENSDART00000170523', 0, 55], + ['ENSDART00000170804', 0, 43], + ['ENSDART00000170804', 43, 366], + ['ENSDART00000171020', 0, 40], + ['ENSDART00000171020', 40, 383], + ['ENSDART00000171201', 0, 40], + ['ENSDART00000171201', 40, 397], + ['ENSDART00000171344', 0, 106], + ['ENSDART00000171344', 106, 462], + ['ENSDART00000171772', 0, 90], + ['ENSDART00000171772', 90, 483], + ['ENSDART00000172037', 0, 43], + ['ENSDART00000172037', 43, 344], + ['ENSDART00000172182', 0, 61], + ['ENSDART00000172182', 61, 413], + ['ENSDART00000172374', 0, 355] + ], + columns=["chrom", "chromStart", "chromEnd"] +) diff --git a/tests/io/empty.gff3 b/tests/io/empty.gff3 new file mode 100644 index 0000000..e69de29 diff --git a/tests/io/ensembl.gff3 b/tests/io/ensembl.gff3 new file mode 100644 index 0000000..08628b7 --- /dev/null +++ b/tests/io/ensembl.gff3 @@ -0,0 +1,100 @@ +1 Ensembl chromosome 1 58871917 . . . ID=chromosome:1;Alias=CM002885.1,NC_007112.6 +1 NCBI biological_region 84 678 . + . external_name=ZFOS-1597F10 (start);logic_name=zebrafish_zfos_clones +1 NCBI biological_region 3991 4793 . + . external_name=CH1073-580J23 (start);logic_name=zebrafish_ch1073_clones +1 ensembl_havana exon 6408 6760 . - . Parent=transcript:ENSDART00000164359;Name=ENSDARE00001184741;constitutive=0;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSDARE00001184741;rank=6;version=1 +1 ensembl_havana gene 6408 12027 . - . ID=gene:ENSDARG00000099104;Name=rpl24;biotype=protein_coding;description=ribosomal protein L24 [Source:ZFIN%3BAcc:ZDB-GENE-020419-25];gene_id=ENSDARG00000099104;logic_name=ensembl_havana_gene;version=1 +1 ensembl_havana mRNA 6408 12027 . - . ID=transcript:ENSDART00000164359;Parent=gene:ENSDARG00000099104;Name=rpl24-203;biotype=protein_coding;transcript_id=ENSDART00000164359;version=1 +1 ensembl_havana three_prime_UTR 6408 6679 . - . Parent=transcript:ENSDART00000164359 +1 havana exon 6642 6760 . - . Parent=transcript:ENSDART00000157701;Name=ENSDARE00001177024;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001177024;rank=3;version=1 +1 havana exon 6642 6760 . - . Parent=transcript:ENSDART00000158290;Name=ENSDARE00001177024;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001177024;rank=2;version=1 +1 havana lnc_RNA 6642 7335 . - . ID=transcript:ENSDART00000158290;Parent=gene:ENSDARG00000099104;Name=rpl24-202;biotype=retained_intron;transcript_id=ENSDART00000158290;version=1 +1 havana lnc_RNA 6642 9919 . - . ID=transcript:ENSDART00000157701;Parent=gene:ENSDARG00000099104;Name=rpl24-201;biotype=retained_intron;transcript_id=ENSDART00000157701;version=1 +1 havana exon 6644 6760 . - . Parent=transcript:ENSDART00000167898;Name=ENSDARE00001205150;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001205150;rank=5;version=1 +1 havana lnc_RNA 6644 11725 . - . ID=transcript:ENSDART00000167898;Parent=gene:ENSDARG00000099104;Name=rpl24-204;biotype=retained_intron;transcript_id=ENSDART00000167898;version=1 +1 ensembl_havana CDS 6680 6760 . - 0 ID=CDS:ENSDARP00000135555;Parent=transcript:ENSDART00000164359;protein_id=ENSDARP00000135555 +1 NCBI biological_region 6708 6905 . + . external_name=ZFOS-1318C1 (start);logic_name=zebrafish_zfos_clones +1 ensembl_havana CDS 6892 6955 . - 1 ID=CDS:ENSDARP00000135555;Parent=transcript:ENSDART00000164359;protein_id=ENSDARP00000135555 +1 ensembl_havana exon 6892 6955 . - . Parent=transcript:ENSDART00000164359;Name=ENSDARE00001160232;constitutive=0;ensembl_end_phase=0;ensembl_phase=2;exon_id=ENSDARE00001160232;rank=5;version=1 +1 havana exon 6892 6955 . - . Parent=transcript:ENSDART00000157701;Name=ENSDARE00001217444;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001217444;rank=2;version=1 +1 havana exon 6892 6955 . - . Parent=transcript:ENSDART00000167898;Name=ENSDARE00001217444;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001217444;rank=4;version=1 +1 havana exon 6892 7335 . - . Parent=transcript:ENSDART00000158290;Name=ENSDARE00001194677;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001194677;rank=1;version=1 +1 ensembl_havana CDS 9558 9694 . - 0 ID=CDS:ENSDARP00000135555;Parent=transcript:ENSDART00000164359;protein_id=ENSDARP00000135555 +1 ensembl_havana exon 9558 9694 . - . Parent=transcript:ENSDART00000164359;Name=ENSDARE00001216183;constitutive=0;ensembl_end_phase=2;ensembl_phase=0;exon_id=ENSDARE00001216183;rank=4;version=1 +1 havana exon 9558 9694 . - . Parent=transcript:ENSDART00000167898;Name=ENSDARE00001212007;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001212007;rank=3;version=1 +1 havana exon 9558 9919 . - . Parent=transcript:ENSDART00000157701;Name=ENSDARE00001156719;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001156719;rank=1;version=1 +1 ensembl_havana CDS 10081 10191 . - 0 ID=CDS:ENSDARP00000135555;Parent=transcript:ENSDART00000164359;protein_id=ENSDARP00000135555 +1 ensembl_havana exon 10081 10191 . - . Parent=transcript:ENSDART00000164359;Name=ENSDARE00001213161;constitutive=0;ensembl_end_phase=0;ensembl_phase=0;exon_id=ENSDARE00001213161;rank=3;version=1 +1 havana exon 10081 10191 . - . Parent=transcript:ENSDART00000167898;Name=ENSDARE00001224135;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001224135;rank=2;version=1 +1 ensembl_havana CDS 11550 11625 . - 1 ID=CDS:ENSDARP00000135555;Parent=transcript:ENSDART00000164359;protein_id=ENSDARP00000135555 +1 ensembl_havana exon 11550 11625 . - . Parent=transcript:ENSDART00000164359;Name=ENSDARE00001167634;constitutive=0;ensembl_end_phase=0;ensembl_phase=2;exon_id=ENSDARE00001167634;rank=2;version=1 +1 havana exon 11550 11725 . - . Parent=transcript:ENSDART00000167898;Name=ENSDARE00001210800;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001210800;rank=1;version=1 +1 NCBI biological_region 11750 12468 . + . external_name=CH1073-287H4 (end);logic_name=zebrafish_ch1073_clones +1 ensembl_havana CDS 11751 11755 . - 0 ID=CDS:ENSDARP00000135555;Parent=transcript:ENSDART00000164359;protein_id=ENSDARP00000135555 +1 ensembl_havana exon 11751 12027 . - . Parent=transcript:ENSDART00000164359;Name=ENSDARE00001207605;constitutive=0;ensembl_end_phase=2;ensembl_phase=-1;exon_id=ENSDARE00001207605;rank=1;version=1 +1 ensembl_havana five_prime_UTR 11756 12027 . - . Parent=transcript:ENSDART00000164359 +1 ensembl_havana exon 11822 12034 . + . Parent=transcript:ENSDART00000166393;Name=ENSDARE00001204042;constitutive=0;ensembl_end_phase=1;ensembl_phase=-1;exon_id=ENSDARE00001204042;rank=1;version=1 +1 ensembl_havana five_prime_UTR 11822 11997 . + . Parent=transcript:ENSDART00000166393 +1 ensembl_havana gene 11822 16373 . + . ID=gene:ENSDARG00000102407;Name=cep97;biotype=protein_coding;description=centrosomal protein 97 [Source:ZFIN%3BAcc:ZDB-GENE-031030-11];gene_id=ENSDARG00000102407;logic_name=ensembl_havana_gene;version=1 +1 ensembl_havana mRNA 11822 16373 . + . ID=transcript:ENSDART00000166393;Parent=gene:ENSDARG00000102407;Name=cep97-203;biotype=protein_coding;transcript_id=ENSDART00000166393;version=1 +1 . biological_region 11954 12953 0.909 + . external_name=rank %3D 1;logic_name=firstef +1 ensembl exon 11967 12034 . + . Parent=transcript:ENSDART00000170165;Name=ENSDARE00001149918;constitutive=0;ensembl_end_phase=1;ensembl_phase=-1;exon_id=ENSDARE00001149918;rank=1;version=1 +1 ensembl five_prime_UTR 11967 11976 . + . Parent=transcript:ENSDART00000170165 +1 ensembl mRNA 11967 16373 . + . ID=transcript:ENSDART00000170165;Parent=gene:ENSDARG00000102407;Name=cep97-205;biotype=protein_coding;transcript_id=ENSDART00000170165;version=1 +1 ensembl CDS 11977 12034 . + 0 ID=CDS:ENSDARP00000133590;Parent=transcript:ENSDART00000170165;protein_id=ENSDARP00000133590 +1 havana CDS 11992 12034 . + 0 ID=CDS:ENSDARP00000141045;Parent=transcript:ENSDART00000161842;protein_id=ENSDARP00000141045 +1 havana exon 11992 12034 . + . Parent=transcript:ENSDART00000161842;Name=ENSDARE00001179296;constitutive=0;ensembl_end_phase=1;ensembl_phase=0;exon_id=ENSDARE00001179296;rank=1;version=1 +1 havana mRNA 11992 14058 . + . ID=transcript:ENSDART00000161842;Parent=gene:ENSDARG00000102407;Name=cep97-201;biotype=protein_coding;transcript_id=ENSDART00000161842;version=1 +1 ensembl_havana CDS 11998 12034 . + 0 ID=CDS:ENSDARP00000130314;Parent=transcript:ENSDART00000166393;protein_id=ENSDARP00000130314 +1 havana exon 12129 12269 . + . Parent=transcript:ENSDART00000170877;Name=ENSDARE00001158433;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001158433;rank=1;version=1 +1 havana lnc_RNA 12129 13257 . + . ID=transcript:ENSDART00000170877;Parent=gene:ENSDARG00000102407;Name=cep97-206;biotype=processed_transcript;transcript_id=ENSDART00000170877;version=1 +1 ensembl CDS 12373 12521 . + 2 ID=CDS:ENSDARP00000133590;Parent=transcript:ENSDART00000170165;protein_id=ENSDARP00000133590 +1 ensembl exon 12373 12521 . + . Parent=transcript:ENSDART00000170165;Name=ENSDARE00001157163;constitutive=0;ensembl_end_phase=0;ensembl_phase=1;exon_id=ENSDARE00001157163;rank=2;version=1 +1 ensembl_havana CDS 12373 12521 . + 2 ID=CDS:ENSDARP00000130314;Parent=transcript:ENSDART00000166393;protein_id=ENSDARP00000130314 +1 ensembl_havana exon 12373 12521 . + . Parent=transcript:ENSDART00000166393;Name=ENSDARE00001157163;constitutive=0;ensembl_end_phase=0;ensembl_phase=1;exon_id=ENSDARE00001157163;rank=2;version=1 +1 havana CDS 12373 12521 . + 2 ID=CDS:ENSDARP00000141045;Parent=transcript:ENSDART00000161842;protein_id=ENSDARP00000141045 +1 havana exon 12373 12521 . + . Parent=transcript:ENSDART00000161842;Name=ENSDARE00001157163;constitutive=0;ensembl_end_phase=0;ensembl_phase=1;exon_id=ENSDARE00001157163;rank=2;version=1 +1 havana exon 12373 12521 . + . Parent=transcript:ENSDART00000170877;Name=ENSDARE00001206566;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001206566;rank=2;version=1 +1 ensembl CDS 12795 12975 . + 0 ID=CDS:ENSDARP00000133590;Parent=transcript:ENSDART00000170165;protein_id=ENSDARP00000133590 +1 ensembl exon 12795 12975 . + . Parent=transcript:ENSDART00000170165;Name=ENSDARE00001173294;constitutive=0;ensembl_end_phase=1;ensembl_phase=0;exon_id=ENSDARE00001173294;rank=3;version=1 +1 ensembl_havana CDS 12795 12953 . + 0 ID=CDS:ENSDARP00000130314;Parent=transcript:ENSDART00000166393;protein_id=ENSDARP00000130314 +1 ensembl_havana exon 12795 12953 . + . Parent=transcript:ENSDART00000166393;Name=ENSDARE00001202073;constitutive=0;ensembl_end_phase=0;ensembl_phase=0;exon_id=ENSDARE00001202073;rank=3;version=1 +1 havana CDS 12795 12953 . + 0 ID=CDS:ENSDARP00000141045;Parent=transcript:ENSDART00000161842;protein_id=ENSDARP00000141045 +1 havana exon 12795 12953 . + . Parent=transcript:ENSDART00000161842;Name=ENSDARE00001202073;constitutive=0;ensembl_end_phase=0;ensembl_phase=0;exon_id=ENSDARE00001202073;rank=3;version=1 +1 havana exon 12795 12953 . + . Parent=transcript:ENSDART00000170877;Name=ENSDARE00001190383;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001190383;rank=3;version=1 +1 ensembl_havana CDS 13034 13135 . + 0 ID=CDS:ENSDARP00000130314;Parent=transcript:ENSDART00000166393;protein_id=ENSDARP00000130314 +1 ensembl_havana exon 13034 13135 . + . Parent=transcript:ENSDART00000166393;Name=ENSDARE00001162094;constitutive=0;ensembl_end_phase=0;ensembl_phase=0;exon_id=ENSDARE00001162094;rank=4;version=1 +1 havana exon 13034 13135 . + . Parent=transcript:ENSDART00000170877;Name=ENSDARE00001154056;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001154056;rank=4;version=1 +1 havana CDS 13070 13135 . + 0 ID=CDS:ENSDARP00000141045;Parent=transcript:ENSDART00000161842;protein_id=ENSDARP00000141045 +1 havana exon 13070 13135 . + . Parent=transcript:ENSDART00000161842;Name=ENSDARE00001156255;constitutive=0;ensembl_end_phase=0;ensembl_phase=0;exon_id=ENSDARE00001156255;rank=4;version=1 +1 ensembl CDS 13136 13332 . + 2 ID=CDS:ENSDARP00000133590;Parent=transcript:ENSDART00000170165;protein_id=ENSDARP00000133590 +1 ensembl exon 13136 13332 . + . Parent=transcript:ENSDART00000170165;Name=ENSDARE00001209522;constitutive=0;ensembl_end_phase=0;ensembl_phase=1;exon_id=ENSDARE00001209522;rank=4;version=1 +1 ensembl_havana CDS 13219 13332 . + 0 ID=CDS:ENSDARP00000130314;Parent=transcript:ENSDART00000166393;protein_id=ENSDARP00000130314 +1 ensembl_havana exon 13219 13332 . + . Parent=transcript:ENSDART00000166393;Name=ENSDARE00001164645;constitutive=0;ensembl_end_phase=0;ensembl_phase=0;exon_id=ENSDARE00001164645;rank=5;version=1 +1 havana CDS 13219 13332 . + 0 ID=CDS:ENSDARP00000141045;Parent=transcript:ENSDART00000161842;protein_id=ENSDARP00000141045 +1 havana exon 13219 13257 . + . Parent=transcript:ENSDART00000170877;Name=ENSDARE00001227417;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001227417;rank=5;version=1 +1 havana exon 13219 13332 . + . Parent=transcript:ENSDART00000161842;Name=ENSDARE00001164645;constitutive=0;ensembl_end_phase=0;ensembl_phase=0;exon_id=ENSDARE00001164645;rank=5;version=1 +1 ensembl CDS 13504 13667 . + 0 ID=CDS:ENSDARP00000133590;Parent=transcript:ENSDART00000170165;protein_id=ENSDARP00000133590 +1 ensembl exon 13504 13667 . + . Parent=transcript:ENSDART00000170165;Name=ENSDARE00001170727;constitutive=0;ensembl_end_phase=2;ensembl_phase=0;exon_id=ENSDARE00001170727;rank=5;version=1 +1 ensembl_havana CDS 13504 13667 . + 0 ID=CDS:ENSDARP00000130314;Parent=transcript:ENSDART00000166393;protein_id=ENSDARP00000130314 +1 ensembl_havana exon 13504 13667 . + . Parent=transcript:ENSDART00000166393;Name=ENSDARE00001170727;constitutive=0;ensembl_end_phase=2;ensembl_phase=0;exon_id=ENSDARE00001170727;rank=6;version=1 +1 havana CDS 13504 13667 . + 0 ID=CDS:ENSDARP00000141045;Parent=transcript:ENSDART00000161842;protein_id=ENSDARP00000141045 +1 havana exon 13504 13667 . + . Parent=transcript:ENSDART00000161842;Name=ENSDARE00001170727;constitutive=0;ensembl_end_phase=2;ensembl_phase=0;exon_id=ENSDARE00001170727;rank=6;version=1 +1 ensembl CDS 14032 14190 . + 1 ID=CDS:ENSDARP00000133590;Parent=transcript:ENSDART00000170165;protein_id=ENSDARP00000133590 +1 ensembl exon 14032 14190 . + . Parent=transcript:ENSDART00000170165;Name=ENSDARE00001217418;constitutive=0;ensembl_end_phase=2;ensembl_phase=2;exon_id=ENSDARE00001217418;rank=6;version=1 +1 ensembl_havana CDS 14032 14190 . + 1 ID=CDS:ENSDARP00000130314;Parent=transcript:ENSDART00000166393;protein_id=ENSDARP00000130314 +1 ensembl_havana exon 14032 14190 . + . Parent=transcript:ENSDART00000166393;Name=ENSDARE00001217418;constitutive=0;ensembl_end_phase=2;ensembl_phase=2;exon_id=ENSDARE00001217418;rank=7;version=1 +1 havana CDS 14032 14058 . + 1 ID=CDS:ENSDARP00000141045;Parent=transcript:ENSDART00000161842;protein_id=ENSDARP00000141045 +1 havana exon 14032 14058 . + . Parent=transcript:ENSDART00000161842;Name=ENSDARE00001213447;constitutive=0;ensembl_end_phase=2;ensembl_phase=2;exon_id=ENSDARE00001213447;rank=7;version=1 +1 havana exon 14167 14402 . + . Parent=transcript:ENSDART00000165461;Name=ENSDARE00001168799;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001168799;rank=1;version=1 +1 havana lnc_RNA 14167 14809 . + . ID=transcript:ENSDART00000165461;Parent=gene:ENSDARG00000102407;Name=cep97-202;biotype=retained_intron;transcript_id=ENSDART00000165461;version=1 +1 ensembl CDS 14290 14402 . + 1 ID=CDS:ENSDARP00000133590;Parent=transcript:ENSDART00000170165;protein_id=ENSDARP00000133590 +1 ensembl exon 14290 14402 . + . Parent=transcript:ENSDART00000170165;Name=ENSDARE00001201156;constitutive=0;ensembl_end_phase=1;ensembl_phase=2;exon_id=ENSDARE00001201156;rank=7;version=1 +1 ensembl_havana CDS 14290 14402 . + 1 ID=CDS:ENSDARP00000130314;Parent=transcript:ENSDART00000166393;protein_id=ENSDARP00000130314 +1 ensembl_havana exon 14290 14402 . + . Parent=transcript:ENSDART00000166393;Name=ENSDARE00001201156;constitutive=0;ensembl_end_phase=1;ensembl_phase=2;exon_id=ENSDARE00001201156;rank=8;version=1 +1 havana exon 14343 14432 . + . Parent=transcript:ENSDART00000171631;Name=ENSDARE00001160785;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001160785;rank=1;version=1 +1 havana lnc_RNA 14343 15063 . + . ID=transcript:ENSDART00000171631;Parent=gene:ENSDARG00000102407;Name=cep97-207;biotype=retained_intron;transcript_id=ENSDART00000171631;version=1 +1 ensembl CDS 14699 15137 . + 2 ID=CDS:ENSDARP00000133590;Parent=transcript:ENSDART00000170165;protein_id=ENSDARP00000133590 +1 ensembl exon 14699 15137 . + . Parent=transcript:ENSDART00000170165;Name=ENSDARE00001202434;constitutive=0;ensembl_end_phase=2;ensembl_phase=1;exon_id=ENSDARE00001202434;rank=8;version=1 +1 ensembl_havana CDS 14699 15137 . + 2 ID=CDS:ENSDARP00000130314;Parent=transcript:ENSDART00000166393;protein_id=ENSDARP00000130314 +1 ensembl_havana exon 14699 15137 . + . Parent=transcript:ENSDART00000166393;Name=ENSDARE00001202434;constitutive=0;ensembl_end_phase=2;ensembl_phase=1;exon_id=ENSDARE00001202434;rank=9;version=1 +1 havana exon 14699 14809 . + . Parent=transcript:ENSDART00000165461;Name=ENSDARE00001203969;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSDARE00001203969;rank=2;version=1 diff --git a/tests/io/gff3.py b/tests/io/gff3.py new file mode 100644 index 0000000..1a98e89 --- /dev/null +++ b/tests/io/gff3.py @@ -0,0 +1,8 @@ +#!/usr/bin/env python3 + +"""tests.io.gff3.py: variables for GFF3 files""" + +GFF3_EMPTY_FN = "tests/io/empty.gff3" +GFF3_ENSEMBL_FN = "tests/io/ensembl.gff3" +GFF3_NCBI_FN = "tests/io/ncbi.gff3" +GFF3_GMAP_FN = "tests/io/gmap.gff3" diff --git a/tests/io/gmap.gff3 b/tests/io/gmap.gff3 new file mode 100644 index 0000000..1a34502 --- /dev/null +++ b/tests/io/gmap.gff3 @@ -0,0 +1,111 @@ +##gff-version 3 +# Generated by GMAP version 2018-07-04 using call: gmap.sse42 --db drer --dir results/gmap/ -f gff3_match_cdna --nthreads 24 results/raw/drer.rna.fa +2 drer cDNA_match 36249168 36249216 100 - . ID=ENSDART00000158772.path1;Name=ENSDART00000158772;Target=ENSDART00000158772 1 49;Gap=M49 +2 drer cDNA_match 36248785 36249083 100 - . ID=ENSDART00000158772.path1;Name=ENSDART00000158772;Target=ENSDART00000158772 50 348;Gap=M299 +### +2 drer cDNA_match 31886285 31886336 100 - . ID=ENSDART00000165410.path1;Name=ENSDART00000165410;Target=ENSDART00000165410 1 52;Gap=M52 +2 drer cDNA_match 31885903 31886200 100 - . ID=ENSDART00000165410.path1;Name=ENSDART00000165410;Target=ENSDART00000165410 53 350;Gap=M298 +### +2 drer cDNA_match 31890210 31890252 100 - . ID=ENSDART00000163675.path1;Name=ENSDART00000163675;Target=ENSDART00000163675 1 43;Gap=M43 +2 drer cDNA_match 31889803 31890098 100 - . ID=ENSDART00000163675.path1;Name=ENSDART00000163675;Target=ENSDART00000163675 44 339;Gap=M296 +### +2 drer cDNA_match 36256421 36256463 100 - . ID=ENSDART00000161463.path1;Name=ENSDART00000161463;Target=ENSDART00000161463 1 43;Gap=M43 +2 drer cDNA_match 36246359 36246732 100 - . ID=ENSDART00000161463.path1;Name=ENSDART00000161463;Target=ENSDART00000161463 44 417;Gap=M374 +### +2 drer cDNA_match 36230947 36231036 100 - . ID=ENSDART00000171772.path1;Name=ENSDART00000171772;Target=ENSDART00000171772 1 90;Gap=M90 +2 drer cDNA_match 36230440 36230832 100 - . ID=ENSDART00000171772.path1;Name=ENSDART00000171772;Target=ENSDART00000171772 91 483;Gap=M393 +### +2 drer cDNA_match 31894253 31894607 100 - . ID=ENSDART00000172374.path1;Name=ENSDART00000172374;Target=ENSDART00000172374 1 355;Gap=M355 +### +2 drer cDNA_match 36239899 36239944 100 - . ID=ENSDART00000159795.path1;Name=ENSDART00000159795;Target=ENSDART00000159795 1 46;Gap=M46 +2 drer cDNA_match 36239366 36239738 100 - . ID=ENSDART00000159795.path1;Name=ENSDART00000159795;Target=ENSDART00000159795 47 419;Gap=M373 +### +2 drer cDNA_match 36234426 36234515 100 - . ID=ENSDART00000168039.path1;Name=ENSDART00000168039;Target=ENSDART00000168039 1 90;Gap=M90 +2 drer cDNA_match 36233940 36234319 100 - . ID=ENSDART00000168039.path1;Name=ENSDART00000168039;Target=ENSDART00000168039 91 470;Gap=M380 +### +2 drer cDNA_match 36235874 36235963 100 - . ID=ENSDART00000157830.path1;Name=ENSDART00000157830;Target=ENSDART00000157830 1 90;Gap=M90 +2 drer cDNA_match 36235418 36235758 100 - . ID=ENSDART00000157830.path1;Name=ENSDART00000157830;Target=ENSDART00000157830 91 431;Gap=M341 +### +2 drer cDNA_match 36232831 36232936 100 - . ID=ENSDART00000171344.path1;Name=ENSDART00000171344;Target=ENSDART00000171344 1 106;Gap=M106 +2 drer cDNA_match 36232374 36232729 100 - . ID=ENSDART00000171344.path1;Name=ENSDART00000171344;Target=ENSDART00000171344 107 462;Gap=M356 +### +2 drer cDNA_match 36256421 36256463 100 - . ID=ENSDART00000166882.path1;Name=ENSDART00000166882;Target=ENSDART00000166882 1 43;Gap=M43 +2 drer cDNA_match 36255879 36256255 100 - . ID=ENSDART00000166882.path1;Name=ENSDART00000166882;Target=ENSDART00000166882 44 420;Gap=M377 +### +2 drer cDNA_match 31902759 31902801 100 - . ID=ENSDART00000170804.path1;Name=ENSDART00000170804;Target=ENSDART00000170804 1 43;Gap=M43 +2 drer cDNA_match 31902363 31902685 100 - . ID=ENSDART00000170804.path1;Name=ENSDART00000170804;Target=ENSDART00000170804 44 366;Gap=M323 +### +2 drer cDNA_match 36231696 36231735 100 - . ID=ENSDART00000167404.path1;Name=ENSDART00000167404;Target=ENSDART00000167404 1 40;Gap=M40 +2 drer cDNA_match 36231258 36231574 100 - . ID=ENSDART00000167404.path1;Name=ENSDART00000167404;Target=ENSDART00000167404 41 357;Gap=M317 +### +2 drer cDNA_match 31883930 31883972 100 - . ID=ENSDART00000172037.path1;Name=ENSDART00000172037;Target=ENSDART00000172037 1 43;Gap=M43 +2 drer cDNA_match 31883504 31883804 100 - . ID=ENSDART00000172037.path1;Name=ENSDART00000172037;Target=ENSDART00000172037 44 344;Gap=M301 +### +2 drer cDNA_match 36274274 36274337 100 - . ID=ENSDART00000158814.path1;Name=ENSDART00000158814;Target=ENSDART00000158814 1 64;Gap=M64 +2 drer cDNA_match 36273855 36274186 100 - . ID=ENSDART00000158814.path1;Name=ENSDART00000158814;Target=ENSDART00000158814 65 396;Gap=M332 +### +2 drer cDNA_match 36260136 36260178 100 - . ID=ENSDART00000162456.path1;Name=ENSDART00000162456;Target=ENSDART00000162456 1 43;Gap=M43 +2 drer cDNA_match 36259665 36259987 100 - . ID=ENSDART00000162456.path1;Name=ENSDART00000162456;Target=ENSDART00000162456 44 366;Gap=M323 +### +2 drer cDNA_match 36237815 36237854 100 - . ID=ENSDART00000167805.path1;Name=ENSDART00000167805;Target=ENSDART00000167805 1 40;Gap=M40 +2 drer cDNA_match 36237130 36237467 100 - . ID=ENSDART00000167805.path1;Name=ENSDART00000167805;Target=ENSDART00000167805 41 378;Gap=M338 +### +2 drer cDNA_match 36277116 36277155 100 - . ID=ENSDART00000171201.path1;Name=ENSDART00000171201;Target=ENSDART00000171201 1 40;Gap=M40 +2 drer cDNA_match 36276434 36276790 100 - . ID=ENSDART00000171201.path1;Name=ENSDART00000171201;Target=ENSDART00000171201 41 397;Gap=M357 +### +2 drer cDNA_match 31906148 31906190 100 - . ID=ENSDART00000160762.path1;Name=ENSDART00000160762;Target=ENSDART00000160762 1 43;Gap=M43 +2 drer cDNA_match 31905757 31906058 100 - . ID=ENSDART00000160762.path1;Name=ENSDART00000160762;Target=ENSDART00000160762 44 345;Gap=M302 +### +2 drer cDNA_match 31897372 31897414 100 - . ID=ENSDART00000161368.path1;Name=ENSDART00000161368;Target=ENSDART00000161368 1 43;Gap=M43 +2 drer cDNA_match 31896976 31897286 100 - . ID=ENSDART00000161368.path1;Name=ENSDART00000161368;Target=ENSDART00000161368 44 354;Gap=M311 +### +2 drer cDNA_match 36247501 36247543 100 - . ID=ENSDART00000164491.path1;Name=ENSDART00000164491;Target=ENSDART00000164491 1 43;Gap=M43 +2 drer cDNA_match 36247072 36247394 100 - . ID=ENSDART00000164491.path1;Name=ENSDART00000164491;Target=ENSDART00000164491 44 366;Gap=M323 +### +2 drer cDNA_match 36262915 36262960 100 - . ID=ENSDART00000164309.path1;Name=ENSDART00000164309;Target=ENSDART00000164309 1 46;Gap=M46 +2 drer cDNA_match 36262453 36262796 100 - . ID=ENSDART00000164309.path1;Name=ENSDART00000164309;Target=ENSDART00000164309 47 390;Gap=M344 +### +2 drer cDNA_match 36265264 36265303 100 - . ID=ENSDART00000160996.path1;Name=ENSDART00000160996;Target=ENSDART00000160996 1 40;Gap=M40 +2 drer cDNA_match 36264616 36264944 100 - . ID=ENSDART00000160996.path1;Name=ENSDART00000160996;Target=ENSDART00000160996 41 369;Gap=M329 +### +2 drer cDNA_match 36241756 36241785 100 - . ID=ENSDART00000160202.path1;Name=ENSDART00000160202;Target=ENSDART00000160202 1 30;Gap=M30 +2 drer cDNA_match 36241302 36241649 100 - . ID=ENSDART00000160202.path1;Name=ENSDART00000160202;Target=ENSDART00000160202 31 378;Gap=M348 +### +2 drer cDNA_match 36266943 36267274 100 - . ID=ENSDART00000166892.path1;Name=ENSDART00000166892;Target=ENSDART00000166892 1 332;Gap=M332 +### +2 drer cDNA_match 36279143 36279203 100 - . ID=ENSDART00000166029.path1;Name=ENSDART00000166029;Target=ENSDART00000166029 1 61;Gap=M61 +2 drer cDNA_match 36278713 36279032 100 - . ID=ENSDART00000166029.path1;Name=ENSDART00000166029;Target=ENSDART00000166029 62 381;Gap=M320 +### +2 drer cDNA_match 36259470 36259509 100 - . ID=ENSDART00000171020.path1;Name=ENSDART00000171020;Target=ENSDART00000171020 1 40;Gap=M40 +2 drer cDNA_match 36258951 36259293 100 - . ID=ENSDART00000171020.path1;Name=ENSDART00000171020;Target=ENSDART00000171020 41 383;Gap=M343 +### +2 drer cDNA_match 36291147 36291186 100 - . ID=ENSDART00000164489.path1;Name=ENSDART00000164489;Target=ENSDART00000164489 1 40;Gap=M40 +2 drer cDNA_match 36290705 36291014 100 - . ID=ENSDART00000164489.path1;Name=ENSDART00000164489;Target=ENSDART00000164489 41 350;Gap=M310 +### +2 drer cDNA_match 36252391 36252433 100 - . ID=ENSDART00000167409.path1;Name=ENSDART00000167409;Target=ENSDART00000167409 1 43;Gap=M43 +2 drer cDNA_match 36251847 36252223 100 - . ID=ENSDART00000167409.path1;Name=ENSDART00000167409;Target=ENSDART00000167409 44 420;Gap=M377 +### +2 drer cDNA_match 36330627 36330687 100 - . ID=ENSDART00000172182.path1;Name=ENSDART00000172182;Target=ENSDART00000172182 1 61;Gap=M61 +2 drer cDNA_match 36330155 36330506 100 - . ID=ENSDART00000172182.path1;Name=ENSDART00000172182;Target=ENSDART00000172182 62 413;Gap=M352 +### +2 drer cDNA_match 36310901 36310970 100 - . ID=ENSDART00000170399.path1;Name=ENSDART00000170399;Target=ENSDART00000170399 1 70;Gap=M70 +2 drer cDNA_match 36310439 36310748 100 - . ID=ENSDART00000170399.path1;Name=ENSDART00000170399;Target=ENSDART00000170399 71 380;Gap=M310 +### +2 drer cDNA_match 36244724 36244763 100 - . ID=ENSDART00000163851.path1;Name=ENSDART00000163851;Target=ENSDART00000163851 1 40;Gap=M40 +2 drer cDNA_match 36244324 36244613 100 - . ID=ENSDART00000163851.path1;Name=ENSDART00000163851;Target=ENSDART00000163851 41 330;Gap=M290 +### +2 drer cDNA_match 36277752 36277818 100 - . ID=ENSDART00000164110.path1;Name=ENSDART00000164110;Target=ENSDART00000164110 1 67;Gap=M67 +2 drer cDNA_match 36277261 36277592 100 - . ID=ENSDART00000164110.path1;Name=ENSDART00000164110;Target=ENSDART00000164110 68 399;Gap=M332 +### +2 drer cDNA_match 36336066 36336120 100 - . ID=ENSDART00000170523.path1;Name=ENSDART00000170523;Target=ENSDART00000170523 1 55;Gap=M55 +9 drer cDNA_match 21404310 21404370 100 + . ID=ENSDART00000171570.path1;Name=ENSDART00000171570;Target=ENSDART00000171570 1 61;Gap=M61 +9 drer cDNA_match 21405940 21406162 100 + . ID=ENSDART00000171570.path1;Name=ENSDART00000171570;Target=ENSDART00000171570 62 284;Gap=M223 +9 drer cDNA_match 21406424 21406601 100 + . ID=ENSDART00000171570.path1;Name=ENSDART00000171570;Target=ENSDART00000171570 285 462;Gap=M178 +9 drer cDNA_match 21407033 21407141 100 + . ID=ENSDART00000171570.path1;Name=ENSDART00000171570;Target=ENSDART00000171570 463 571;Gap=M109 +9 drer cDNA_match 21408140 21408241 100 + . ID=ENSDART00000171570.path1;Name=ENSDART00000171570;Target=ENSDART00000171570 572 673;Gap=M102 +9 drer cDNA_match 21408611 21408703 100 + . ID=ENSDART00000171570.path1;Name=ENSDART00000171570;Target=ENSDART00000171570 674 766;Gap=M93 +9 drer cDNA_match 21408817 21408984 100 + . ID=ENSDART00000171570.path1;Name=ENSDART00000171570;Target=ENSDART00000171570 767 934;Gap=M168 +9 drer cDNA_match 21410979 21411068 100 + . ID=ENSDART00000171570.path1;Name=ENSDART00000171570;Target=ENSDART00000171570 935 1024;Gap=M90 +9 drer cDNA_match 21411847 21411955 100 + . ID=ENSDART00000171570.path1;Name=ENSDART00000171570;Target=ENSDART00000171570 1025 1133;Gap=M109 +9 drer cDNA_match 21412066 21412221 100 + . ID=ENSDART00000171570.path1;Name=ENSDART00000171570;Target=ENSDART00000171570 1134 1289;Gap=M156 +9 drer cDNA_match 21413422 21413503 100 + . ID=ENSDART00000171570.path1;Name=ENSDART00000171570;Target=ENSDART00000171570 1290 1371;Gap=M82 diff --git a/tests/test_io/test_gff3_to_bed.py b/tests/test_io/test_gff3_to_bed.py new file mode 100644 index 0000000..e166c3f --- /dev/null +++ b/tests/test_io/test_gff3_to_bed.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 + +"""tests.test_io.test_gff3_to_bed.py: tests for exfi.io.gff3_to_bed.py""" + +from unittest import TestCase, main + +from exfi.io.gff3_to_bed import gff3_to_bed3 + +from tests.io.gff3 import \ + GFF3_EMPTY_FN, GFF3_ENSEMBL_FN, GFF3_GMAP_FN + +from tests.io.bed import \ + BED3_EMPTY, BED3_ENSEMBL, BED3_GMAP + + + + +class TestGFF3ToBED(TestCase): + """Tests for exfi.io.gff3_to_bed.gff3_to_bed3""" + + def test_empty(self): + """exfi.io.gff3_to_bed.gff3_to_bed3: empty case""" + observed = gff3_to_bed3(GFF3_EMPTY_FN) + self.assertTrue(observed.equals(BED3_EMPTY)) + + def test_ensembl(self): + """exfi.io.gff3_to_bed.gff3_to_bed3: ensembl case""" + observed = gff3_to_bed3(GFF3_ENSEMBL_FN, mode="ensembl") + print("Observed", observed.values.tolist(), observed.dtypes, sep='\n') + self.assertTrue(observed.equals(BED3_ENSEMBL)) + + def test_gmap(self): + """exfi.io.gff3_to_bed.gff3_to_bed3: gmap case""" + observed = gff3_to_bed3(GFF3_GMAP_FN, mode="gmap") + print("Observed", observed.values.tolist(), observed.dtypes, sep='\n') + self.assertTrue(observed.equals(BED3_GMAP)) + + # def test_ncbi(self): + # """exfi.io.gff3_to_bed.gff3_to_bed3: ncbi case""" + # observed = gff3_to_bed3(GFF3_NCBI_FN, mode=ncbi) + # self.assertTrue(observed.equals(BED3_NCBI)) + + + + +if __name__ == '__main__': + main() From 2784f30fb357899982ba9d7bb753b9d00ffc8e6a Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Wed, 16 Jan 2019 15:06:01 +0100 Subject: [PATCH 27/45] Cleanup of splice graph files --- tests/test_io/test_components.py | 43 ---- tests/test_io/test_gfa1_to_exons.py | 86 -------- .../test_gfa1_to_gapped_transcripts.py | 86 -------- tests/test_io/test_gfa1_to_splice_graph.py | 45 ---- tests/test_io/test_read_gfa1.py | 204 ------------------ 5 files changed, 464 deletions(-) delete mode 100644 tests/test_io/test_components.py delete mode 100644 tests/test_io/test_gfa1_to_exons.py delete mode 100644 tests/test_io/test_gfa1_to_gapped_transcripts.py delete mode 100644 tests/test_io/test_gfa1_to_splice_graph.py delete mode 100644 tests/test_io/test_read_gfa1.py diff --git a/tests/test_io/test_components.py b/tests/test_io/test_components.py deleted file mode 100644 index 0525f7b..0000000 --- a/tests/test_io/test_components.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python3 - -"""Tests for exfi.io.join_components submodule""" - -from unittest import TestCase, main - -from exfi.io.components import \ - join_components, \ - split_into_components - -from tests.custom_assertions import \ - CustomAssertions - -from tests.data import \ - SPLICE_GRAPH_EMPTY, SPLICE_GRAPH_SIMPLE, SPLICE_GRAPH_COMPLEX - - - -class TestJoinComponents(TestCase, CustomAssertions): - """Tests for exfi.io.join_components.join_components""" - - def test_empty(self): - """exfi.io.join_components.join_components: empty case""" - actual = join_components(split_into_components(SPLICE_GRAPH_EMPTY)) - self.assertEqualSpliceGraphs(actual, SPLICE_GRAPH_EMPTY) - - - def test_simple(self): - """exfi.io.join_components.join_components: simple case""" - actual = join_components(split_into_components(SPLICE_GRAPH_SIMPLE)) - self.assertEqualSpliceGraphs(actual, SPLICE_GRAPH_SIMPLE) - - def test_complex(self): - """exfi.io.join_components.join_components: complex case""" - actual = join_components(split_into_components(SPLICE_GRAPH_COMPLEX)) - self.assertEqualSpliceGraphs(actual, SPLICE_GRAPH_COMPLEX) - - - - - -if __name__ == '__main__': - main() diff --git a/tests/test_io/test_gfa1_to_exons.py b/tests/test_io/test_gfa1_to_exons.py deleted file mode 100644 index 95fb096..0000000 --- a/tests/test_io/test_gfa1_to_exons.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python3 - -""" -Tests for exfi.io.gfa1_to_exons -""" - -from unittest import TestCase, main - -import filecmp -import tempfile -import os - -from exfi.io.gfa1_to_exons import \ - gfa1_to_exons - -from exfi.io.fasta_to_dict import \ - fasta_to_dict - -from tests.data import \ - GFA_EMPTY_FN, GFA_SIMPLE_FN, GFA_COMPLEX_FN - -EXONS_EMPTY_FN = "tests/io/exons_empty.fa" -EXONS_SIMPLE_FN = "tests/io/exons_simple.fa" -EXONS_COMPLEX_FN = "tests/io/exons_complex.fa" -EXONS_COMPLEX_SOFT_FN = "tests/io/exons_complex_soft.fa" -EXONS_COMPLEX_HARD_FN = "tests/io/exons_complex_hard.fa" - -EXONS_EMPTY_DICT = fasta_to_dict(EXONS_EMPTY_FN) -EXONS_SIMPLE_DICT = fasta_to_dict(EXONS_SIMPLE_FN) -EXONS_COMPLEX_DICT = fasta_to_dict(EXONS_COMPLEX_FN) -EXONS_COMPLEX_SOFT_DICT = fasta_to_dict(EXONS_COMPLEX_SOFT_FN) -EXONS_COMPLEX_HARD_DICT = fasta_to_dict(EXONS_COMPLEX_HARD_FN) - -class TestGFA1ToExons(TestCase): - """Tests for gfa1_to_exons""" - - def test_empty(self): - """exfi.io.gfa1_to_exons.gfa1_to_exons: empty case""" - tmp_file = tempfile.mkstemp()[1] - gfa1_to_exons(gfa_in_fn=GFA_EMPTY_FN, fasta_out_fn=tmp_file, masking="none") - self.assertTrue(filecmp.cmp(tmp_file, EXONS_EMPTY_FN)) - os.remove(tmp_file) - - - def test_simple(self): - """exfi.io.gfa1_to_exons.gfa1_to_exons: simple case""" - tmp_file = tempfile.mkstemp()[1] - gfa1_to_exons(gfa_in_fn=GFA_SIMPLE_FN, fasta_out_fn=tmp_file) - self.assertTrue(filecmp.cmp(tmp_file, EXONS_SIMPLE_FN)) - os.remove(tmp_file) - - def test_multiple(self): - """exfi.io.gfa1_to_exons.gfa1_to_exons: complex case""" - tmp_file = tempfile.mkstemp()[1] - gfa1_to_exons(gfa_in_fn=GFA_COMPLEX_FN, fasta_out_fn=tmp_file) - self.assertTrue(filecmp.cmp(tmp_file, EXONS_COMPLEX_FN)) - os.remove(tmp_file) - - def test_multiple_soft(self): - """exfi.io.gfa1_to_exons.gfa1_to_exons: complex case and soft masking""" - tmp_file = tempfile.mkstemp()[1] - gfa1_to_exons( - gfa_in_fn=GFA_COMPLEX_FN, - fasta_out_fn=tmp_file, - masking="soft" - ) - self.assertTrue( - filecmp.cmp(tmp_file, EXONS_COMPLEX_SOFT_FN) - ) - os.remove(tmp_file) - - def test_multiple_hard(self): - """exfi.io.gfa1_to_exons.gfa1_to_exons: complex case and hard masking""" - tmp_file = tempfile.mkstemp()[1] - gfa1_to_exons( - gfa_in_fn=GFA_COMPLEX_FN, - fasta_out_fn=tmp_file, - masking="hard" - ) - self.assertTrue( - filecmp.cmp(tmp_file, EXONS_COMPLEX_HARD_FN) - ) - os.remove(tmp_file) - -if __name__ == '__main__': - main() diff --git a/tests/test_io/test_gfa1_to_gapped_transcripts.py b/tests/test_io/test_gfa1_to_gapped_transcripts.py deleted file mode 100644 index 892a619..0000000 --- a/tests/test_io/test_gfa1_to_gapped_transcripts.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python3 - -""" -Tests for exfi.io.gfa1_to_gapped_transcripts -""" - - -from unittest import TestCase, main - -import filecmp -import tempfile -import os - -from exfi.io.gfa1_to_gapped_transcripts import \ - gfa1_to_gapped_transcripts - -from tests.data import \ - GFA_EMPTY_FN, GFA_SIMPLE_FN, GFA_COMPLEX_FN - - -GAPPED_EMPTY_FN = "tests/io/gapped_empty.fa" -GAPPED_SIMPLE_FN = "tests/io/gapped_simple.fa" -GAPPED_COMPLEX_FN = "tests/io/gapped_complex.fa" -GAPPED_COMPLEX_SOFT_FN = "tests/io/gapped_complex_soft.fa" -GAPPED_COMPLEX_HARD_FN = "tests/io/gapped_complex_hard.fa" - -class TestGFA1ToGappedTranscripts(TestCase): - """Tests for gfa1_to_gapped_transcript""" - - def test_empty(self): - """exfi.io.gfa1_to_gapped_transcripts.gfa1_to_gapped_transcripts: empty case""" - tmp_file = tempfile.mkstemp()[1] - gfa1_to_gapped_transcripts(gfa_in=GFA_EMPTY_FN, fasta_out=tmp_file) - self.assertTrue( - filecmp.cmp(tmp_file, GAPPED_EMPTY_FN) - ) - os.remove(tmp_file) - - def test_simple(self): - """exfi.io.gfa1_to_gapped_transcripts.gfa1_to_gapped_transcripts: simple case""" - tmp_file = tempfile.mkstemp()[1] - gfa1_to_gapped_transcripts(gfa_in=GFA_SIMPLE_FN, fasta_out=tmp_file) - self.assertTrue( - filecmp.cmp(tmp_file, GAPPED_SIMPLE_FN) - ) - os.remove(tmp_file) - - def test_multiple(self): - """exfi.io.gfa1_to_gapped_transcripts.gfa1_to_gapped_transcripts: complex case""" - tmp_file = tempfile.mkstemp()[1] - gfa1_to_gapped_transcripts(gfa_in=GFA_COMPLEX_FN, fasta_out=tmp_file) - self.assertTrue(filecmp.cmp( - tmp_file, - GAPPED_COMPLEX_FN - )) - os.remove(tmp_file) - - def test_multiple_soft(self): - """exfi.io.gfa1_to_gapped_transcripts.gfa1_to_gapped_transcripts: complex case and soft - masking""" - tmp_file = tempfile.mkstemp()[1] - gfa1_to_gapped_transcripts( - gfa_in=GFA_COMPLEX_FN, - fasta_out=tmp_file, - masking="soft" - ) - self.assertTrue( - filecmp.cmp(tmp_file, GAPPED_COMPLEX_SOFT_FN) - ) - os.remove(tmp_file) - - - def test_multiple_hard(self): - """exfi.io.gfa1_to_gapped_transcripts.gfa1_to_gapped_transcripts: complex case and hard - masking""" - tmp_file = tempfile.mkstemp()[1] - gfa1_to_gapped_transcripts( - gfa_in=GFA_COMPLEX_FN, - fasta_out=tmp_file, - masking="hard" - ) - self.assertTrue(filecmp.cmp(tmp_file, GAPPED_COMPLEX_HARD_FN)) - os.remove(tmp_file) - -if __name__ == '__main__': - main() diff --git a/tests/test_io/test_gfa1_to_splice_graph.py b/tests/test_io/test_gfa1_to_splice_graph.py deleted file mode 100644 index bb1af76..0000000 --- a/tests/test_io/test_gfa1_to_splice_graph.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 - -""" -Tests for exfi.io.gfa1_to_splice_graph -""" - -from unittest import TestCase, main - -from exfi.io.gfa1_to_splice_graph_dict import gfa1_to_splice_graph_dict - -from tests.custom_assertions import \ - CustomAssertions - -from tests.data import \ - SPLICE_GRAPH_EMPTY_DICT, SPLICE_GRAPH_SIMPLE_DICT, SPLICE_GRAPH_COMPLEX_DICT, \ - GFA_EMPTY_FN, GFA_SIMPLE_FN, GFA_COMPLEX_FN - - - -class TestGFA1ToSpliceGrah(TestCase, CustomAssertions): - """Tests for exfi.io.gfa1_to_splice_graph.gfa1_to_splice_graph""" - - def test_empty(self): - """exfi.io.gfa1_to_splice_graph.gfa1_to_splice_graph: empty case""" - actual = gfa1_to_splice_graph_dict(GFA_EMPTY_FN) - expected = SPLICE_GRAPH_EMPTY_DICT - self.assertEqualDictOfSpliceGraphs(actual, expected) - - - def test_simple(self): - """exfi.io.gfa1_to_splice_graph.gfa1_to_splice_graph: simple case""" - actual = gfa1_to_splice_graph_dict(GFA_SIMPLE_FN) - expected = SPLICE_GRAPH_SIMPLE_DICT - self.assertEqualDictOfSpliceGraphs(actual, expected) - - - def test_complex(self): - """exfi.io.gfa1_to_splice_graph.gfa1_to_splice_graph: complex case""" - actual = gfa1_to_splice_graph_dict(GFA_COMPLEX_FN) - expected = SPLICE_GRAPH_COMPLEX_DICT - self.assertEqualDictOfSpliceGraphs(actual, expected) - - -if __name__ == '__main__': - main() diff --git a/tests/test_io/test_read_gfa1.py b/tests/test_io/test_read_gfa1.py deleted file mode 100644 index a59687b..0000000 --- a/tests/test_io/test_read_gfa1.py +++ /dev/null @@ -1,204 +0,0 @@ -#!/usr/bin/env python3 - -""" -Tests for exfi.io.read_gfa1 -""" - -from unittest import TestCase, main - -from exfi.io.read_gfa1 import \ - _overlap_str_to_int, \ - _process_segments, \ - _process_links, \ - _process_containments, \ - _process_paths, \ - read_gfa1 - - -from tests.data import \ - GFA_EMPTY_FN, GFA_SIMPLE_FN, GFA_COMPLEX_FN, \ - SEGMENTS_EMPTY, SEGMENTS_SIMPLE, SEGMENTS_COMPLEX, \ - SEGMENTS_EMPTY_DICT, SEGMENTS_SIMPLE_DICT, SEGMENTS_COMPLEX_DICT, \ - LINKS_EMPTY, LINKS_SIMPLE, LINKS_COMPLEX, \ - LINKS_EMPTY_DICT, LINKS_SIMPLE_DICT, LINKS_COMPLEX_DICT, \ - CONTAINMENTS_EMPTY, CONTAINMENTS_SIMPLE, CONTAINMENTS_COMPLEX, \ - CONTAINMENTS_EMPTY_DICT, CONTAINMENTS_SIMPLE_DICT, CONTAINMENTS_COMPLEX_DICT, \ - PATHS_EMPTY, PATHS_SIMPLE, PATHS_COMPLEX, \ - PATHS_EMPTY_DICT, PATHS_SIMPLE_DICT, PATHS_COMPLEX_DICT - - - -def _split_lines(list_of_elements): - """Process line""" - for element in list_of_elements: - yield element.strip().split("\t") - - - -class TestOverlapStrToInt(TestCase): - """Tests of exfi.io.read_gfa1._overlap_str_to_int""" - - def test_match(self): - """exfi.io.read_gfa1._overlap_str_to_int: wrong case""" - self.assertEqual(_overlap_str_to_int("13M"), 13) - - def test_gap(self): - """exfi.io.read_gfa1._overlap_str_to_int: wrong case""" - self.assertEqual(_overlap_str_to_int("12G"), -12) - - def test_missing_letter(self): - """exfi.io.read_gfa1._overlap_str_to_int: messy case""" - with self.assertRaises(ValueError): - _overlap_str_to_int("12") - - def test_fail_type(self): - """exfi.io.read_gfa1._overlap_str_to_int: not str""" - with self.assertRaises(TypeError): - _overlap_str_to_int(42) - - -class TestProcessSegments(TestCase): - """Tests of exfi.io.read_gfa1._process_segments""" - - def test_empty(self): - """exfi.io.read_gfa1._process_segments: empty case""" - self.assertEqual( - _process_segments(_split_lines(SEGMENTS_EMPTY)), - SEGMENTS_EMPTY_DICT - ) - - def test_simple(self): - """exfi.io.read_gfa1._process_segments: simple case""" - self.assertEqual( - _process_segments(_split_lines(SEGMENTS_SIMPLE)), - SEGMENTS_SIMPLE_DICT - ) - def test_complex(self): - """exfi.io.read_gfa1._process_segments: complex case""" - self.assertEqual( - _process_segments(_split_lines(SEGMENTS_COMPLEX)), - SEGMENTS_COMPLEX_DICT - ) - - - -class TestProcessLinks(TestCase): - """Tests of exfi.io.read_gfa1._process_links""" - - def test_empty(self): - """exfi.io.read_gfa1._process_links: empty case""" - self.assertEqual( - _process_links(_split_lines(LINKS_EMPTY)), - LINKS_EMPTY_DICT - ) - - def test_simple(self): - """exfi.io.read_gfa1._process_links: simple case""" - self.assertEqual( - _process_links(_split_lines(LINKS_SIMPLE)), - LINKS_SIMPLE_DICT - ) - def test_complex(self): - """exfi.io.read_gfa1._process_links: complex case""" - self.assertEqual( - _process_links(_split_lines(LINKS_COMPLEX)), - LINKS_COMPLEX_DICT - ) - - - -class TestProcessContainments(TestCase): - """Tests of exfi.io.read_gfa1._process_containments""" - - def test_empty(self): - """exfi.io.read_gfa1._process_containments: empty case""" - self.assertEqual( - _process_containments(_split_lines(CONTAINMENTS_EMPTY)), - CONTAINMENTS_EMPTY_DICT - ) - - def test_simple(self): - """exfi.io.read_gfa1._process_containments: simple case""" - self.assertEqual( - _process_containments(_split_lines(CONTAINMENTS_SIMPLE)), - CONTAINMENTS_SIMPLE_DICT - ) - def test_complex(self): - """exfi.io.read_gfa1._process_containments: complex case""" - self.assertEqual( - _process_containments(_split_lines(CONTAINMENTS_COMPLEX)), - CONTAINMENTS_COMPLEX_DICT - ) - - - -class TestProcessPaths(TestCase): - """Tests of exfi.io.read_gfa1._process_paths""" - - def test_empty(self): - """exfi.io.read_gfa1._process_paths: empty case""" - self.assertEqual( - _process_paths(_split_lines(PATHS_EMPTY)), - PATHS_EMPTY_DICT - ) - - def test_simple(self): - """exfi.io.read_gfa1._process_paths: simple case""" - self.assertEqual( - _process_paths(_split_lines(PATHS_SIMPLE)), - PATHS_SIMPLE_DICT - ) - def test_complex(self): - """exfi.io.read_gfa1._process_paths: complex case""" - self.assertEqual( - _process_paths(_split_lines(PATHS_COMPLEX)), - PATHS_COMPLEX_DICT - ) - - -class TestReadGFA1(TestCase): - """Tests for exfi.io.read_gfa1.read_gfa1""" - - def test_empty(self): - """exfi.io.read_gfa1.read_gfa1: empty case""" - self.assertEqual( - read_gfa1(GFA_EMPTY_FN), - { - "header": "VN:Z:1.0", - "segments": SEGMENTS_EMPTY_DICT, - "links": LINKS_EMPTY_DICT, - "containments": CONTAINMENTS_EMPTY_DICT, - "paths": PATHS_EMPTY_DICT - } - ) - - def test_simple(self): - """exfi.io.read_gfa1.read_gfa1: simple case""" - self.assertEqual( - read_gfa1(GFA_SIMPLE_FN), - { - "header": "VN:Z:1.0", - "segments": SEGMENTS_SIMPLE_DICT, - "links": LINKS_SIMPLE_DICT, - "containments": CONTAINMENTS_SIMPLE_DICT, - "paths": PATHS_SIMPLE_DICT - } - ) - - def test_complex(self): - """exfi.io.read_gfa1.read_gfa1: complex case""" - self.assertEqual( - read_gfa1(GFA_COMPLEX_FN), - { - "header": "VN:Z:1.0", - "segments": SEGMENTS_COMPLEX_DICT, - "links": LINKS_COMPLEX_DICT, - "containments": CONTAINMENTS_COMPLEX_DICT, - "paths": PATHS_COMPLEX_DICT - } - ) - - - -if __name__ == "__main__": - main() From 7354883887b40a3d67ff0e67d22d5965f742aa1f Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Wed, 16 Jan 2019 15:23:13 +0100 Subject: [PATCH 28/45] Deleted tests.data. Modified files accordingly --- tests/data.py | 346 --------------------------------------- tests/test_find_exons.py | 45 ++--- tests/test_polish.py | 2 +- 3 files changed, 14 insertions(+), 379 deletions(-) delete mode 100644 tests/data.py diff --git a/tests/data.py b/tests/data.py deleted file mode 100644 index 1366bae..0000000 --- a/tests/data.py +++ /dev/null @@ -1,346 +0,0 @@ -#!/usr/bin/env python3 - -""" -Constant values for testing -""" - -import pandas as pd -import networkx as nx - -from exfi.io.fasta_to_dict import \ - fasta_to_dict - -BED6_COLS = ["chrom", "start", "end", "name", "score", "strand"] - - -BED3RECORDS_EMPTY_FN = "tests/find_exons/empty.bed" -BED3RECORDS_SIMPLE_FN = "tests/find_exons/simple.bed" -BED3RECORDS_COMPLEX_FN = "tests/find_exons/complex.bed" - -BED3RECORDS_EMPTY = [] -BED3RECORDS_SIMPLE = [ - ("ENSDART00000161035.1", 0, 326) -] -BED3RECORDS_COMPLEX = [ - ("ENSDART00000161035.1", 397, 472), - ("ENSDART00000165342.1", 1176, 1324), - ("ENSDART00000161035.1", 0, 326), - ("ENSDART00000165342.1", 125, 304), - ("ENSDART00000165342.1", 746, 851), - ("ENSDART00000165342.1", 974, 1097), - ("ENSDART00000165342.1", 854, 886), - ("ENSDART00000165342.1", 1098, 1175), - ("ENSDART00000165342.1", 5, 127), - ("ENSDART00000165342.1", 645, 746), - ("ENSDART00000165342.1", 317, 460), - ("ENSDART00000165342.1", 591, 650), - ("ENSDART00000165342.1", 459, 592), - ("ENSDART00000165342.1", 899, 953), - ("ENSDART00000161035.1", 477, 523) -] - -DATA_COMPLEX_PART1 = [ - ["ENSDART00000161035.1", 397, 472, "ENSDART00000161035.1:397-472", 0, "+"], - ["ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326", 0, "+"], - ["ENSDART00000161035.1", 477, 523, "ENSDART00000161035.1:477-523", 0, "+"], -] -DATA_COMPLEX_PART2 = [ - ["ENSDART00000165342.1", 1176, 1324, "ENSDART00000165342.1:1176-1324", 0, "+"], - ["ENSDART00000165342.1", 125, 304, "ENSDART00000165342.1:125-304", 0, "+"], - ["ENSDART00000165342.1", 746, 851, "ENSDART00000165342.1:746-851", 0, "+"], - ["ENSDART00000165342.1", 974, 1097, "ENSDART00000165342.1:974-1097", 0, "+"], - ["ENSDART00000165342.1", 854, 886, "ENSDART00000165342.1:854-886", 0, "+"], - ["ENSDART00000165342.1", 1098, 1175, "ENSDART00000165342.1:1098-1175", 0, "+"], - ["ENSDART00000165342.1", 5, 127, "ENSDART00000165342.1:5-127", 0, "+"], - ["ENSDART00000165342.1", 645, 746, "ENSDART00000165342.1:645-746", 0, "+"], - ["ENSDART00000165342.1", 317, 460, "ENSDART00000165342.1:317-460", 0, "+"], - ["ENSDART00000165342.1", 591, 650, "ENSDART00000165342.1:591-650", 0, "+"], - ["ENSDART00000165342.1", 459, 592, "ENSDART00000165342.1:459-592", 0, "+"], - ["ENSDART00000165342.1", 899, 953, "ENSDART00000165342.1:899-953", 0, "+"], -] - - -BED6DF_EMPTY = pd.DataFrame(columns=BED6_COLS) -BED6DF_SIMPLE = pd.DataFrame( - data=[("ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326", 0, "+")], - columns=BED6_COLS -) -BED6DF_COMPLEX = pd.DataFrame( - data=DATA_COMPLEX_PART1 + DATA_COMPLEX_PART2, - columns=BED6_COLS -)\ -.sort_values(BED6_COLS[0:3]) - -BED6DF_DICT_EMPTY = {} -BED6DF_DICT_SIMPLE = { - "ENSDART00000161035.1": pd.DataFrame( - data=[("ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326", 0, "+")], - columns=BED6_COLS - ) -} -BED6DF_DICT_COMPLEX = { - "ENSDART00000161035.1": pd.DataFrame( - data=DATA_COMPLEX_PART1, - columns=BED6_COLS - )\ - .sort_values(BED6_COLS[0:3]), - "ENSDART00000165342.1": pd.DataFrame( - data=DATA_COMPLEX_PART2, - columns=BED6_COLS - )\ - .sort_values(BED6_COLS[0:3]) -} - - -NODE2COORDS_COMPLEX_PART1 = { - "ENSDART00000161035.1:0-326": (("ENSDART00000161035.1", 0, 326), ), - "ENSDART00000161035.1:397-472": (("ENSDART00000161035.1", 397, 472), ), - "ENSDART00000161035.1:477-523": (("ENSDART00000161035.1", 477, 523), ), -} -NODE2COORDS_COMPLEX_PART2 = { - "ENSDART00000165342.1:5-127": (("ENSDART00000165342.1", 5, 127), ), - "ENSDART00000165342.1:125-304": (("ENSDART00000165342.1", 125, 304), ), - "ENSDART00000165342.1:317-460": (("ENSDART00000165342.1", 317, 460), ), - "ENSDART00000165342.1:459-592": (("ENSDART00000165342.1", 459, 592), ), - "ENSDART00000165342.1:591-650": (("ENSDART00000165342.1", 591, 650), ), - "ENSDART00000165342.1:645-746": (("ENSDART00000165342.1", 645, 746), ), - "ENSDART00000165342.1:746-851": (("ENSDART00000165342.1", 746, 851), ), - "ENSDART00000165342.1:854-886": (("ENSDART00000165342.1", 854, 886), ), - "ENSDART00000165342.1:899-953": (("ENSDART00000165342.1", 899, 953), ), - "ENSDART00000165342.1:974-1097": (("ENSDART00000165342.1", 974, 1097), ), - "ENSDART00000165342.1:1098-1175": (("ENSDART00000165342.1", 1098, 1175), ), - "ENSDART00000165342.1:1176-1324": (("ENSDART00000165342.1", 1176, 1324), ) -} - -NODE2COORDS_EMPTY = {} -NODE2COORDS_SIMPLE = { - "ENSDART00000161035.1:0-326": (("ENSDART00000161035.1", 0, 326), ) -} -NODE2COORDS_COMPLEX = {**NODE2COORDS_COMPLEX_PART1, **NODE2COORDS_COMPLEX_PART2} - -PATH_EMPTY = {} -PATH_SIMPLE = {"ENSDART00000161035.1": ("ENSDART00000161035.1:0-326",)} -PATH_COMPLEX = { - "ENSDART00000161035.1": tuple(NODE2COORDS_COMPLEX_PART1.keys()), - "ENSDART00000165342.1": tuple(NODE2COORDS_COMPLEX_PART2.keys()) -} - -OVERLAPS_COMPLEX_PART1 = { - ("ENSDART00000161035.1:0-326", "ENSDART00000161035.1:397-472"): -71, - ("ENSDART00000161035.1:397-472", "ENSDART00000161035.1:477-523"): -5 -} -OVERLAPS_COMPLEX_PART2 = { - ("ENSDART00000165342.1:5-127", "ENSDART00000165342.1:125-304"): 2, - ("ENSDART00000165342.1:125-304", "ENSDART00000165342.1:317-460"): -13, - ("ENSDART00000165342.1:317-460", "ENSDART00000165342.1:459-592"): 1, - ("ENSDART00000165342.1:459-592", "ENSDART00000165342.1:591-650"): 1, - ("ENSDART00000165342.1:591-650", "ENSDART00000165342.1:645-746"): 5, - ("ENSDART00000165342.1:645-746", "ENSDART00000165342.1:746-851"): 0, - ("ENSDART00000165342.1:746-851", "ENSDART00000165342.1:854-886"): -3, - ("ENSDART00000165342.1:854-886", "ENSDART00000165342.1:899-953"): -13, - ("ENSDART00000165342.1:899-953", "ENSDART00000165342.1:974-1097"): -21, - ("ENSDART00000165342.1:974-1097", "ENSDART00000165342.1:1098-1175"): -1, - ("ENSDART00000165342.1:1098-1175", "ENSDART00000165342.1:1176-1324"): -1 -} - - -OVERLAPS_EMPTY = {} -OVERLAPS_SIMPLE = {} -OVERLAPS_COMPLEX = {**OVERLAPS_COMPLEX_PART1, **OVERLAPS_COMPLEX_PART2} - -OVERLAPS_EMPTY_DICT = {} -OVERLAPS_SIMPLE_DICT = {'ENSDART00000161035.1': {}} -OVERLAPS_COMPLEX_DICT = { - "ENSDART00000161035.1": OVERLAPS_COMPLEX_PART1, - "ENSDART00000165342.1": OVERLAPS_COMPLEX_PART2 -} - - -SPLICE_GRAPH_EMPTY = nx.DiGraph() -SPLICE_GRAPH_SIMPLE = nx.DiGraph() -SPLICE_GRAPH_SIMPLE.add_nodes_from(BED6DF_SIMPLE["name"].tolist()) -nx.set_node_attributes( - G=SPLICE_GRAPH_SIMPLE, - name="coordinates", - values=NODE2COORDS_SIMPLE -) -for PATH in PATH_SIMPLE.values(): - SPLICE_GRAPH_SIMPLE.add_path(PATH) -nx.set_edge_attributes( - G=SPLICE_GRAPH_SIMPLE, - name="overlaps", - values=OVERLAPS_SIMPLE -) - -SPLICE_GRAPH_COMPLEX = nx.DiGraph() -SPLICE_GRAPH_COMPLEX.add_nodes_from(BED6DF_COMPLEX["name"].tolist()) -nx.set_node_attributes( - G=SPLICE_GRAPH_COMPLEX, name="coordinates", values=NODE2COORDS_COMPLEX -) -for PATH in PATH_COMPLEX.values(): - SPLICE_GRAPH_COMPLEX.add_path(PATH) -nx.set_edge_attributes( - G=SPLICE_GRAPH_COMPLEX, name="overlaps", values=OVERLAPS_COMPLEX -) - - -SPLICE_GRAPH_EMPTY_DICT = {} -SPLICE_GRAPH_SIMPLE_DICT = { - "ENSDART00000161035.1": SPLICE_GRAPH_SIMPLE, -} - - -SPLICE_GRAPH_1 = nx.DiGraph() -SPLICE_GRAPH_1.add_nodes_from(NODE2COORDS_COMPLEX_PART1.keys()) -nx.set_node_attributes(G=SPLICE_GRAPH_1, name="coordinates", values=NODE2COORDS_COMPLEX_PART1) -SPLICE_GRAPH_1.add_edges_from(OVERLAPS_COMPLEX_DICT["ENSDART00000161035.1"].keys()) -nx.set_edge_attributes( - G=SPLICE_GRAPH_1, name="overlaps", values=OVERLAPS_COMPLEX_DICT["ENSDART00000161035.1"] -) - -SPLICE_GRAPH_2 = nx.DiGraph() -SPLICE_GRAPH_2.add_nodes_from(NODE2COORDS_COMPLEX_PART2.keys()) -nx.set_node_attributes( - G=SPLICE_GRAPH_2, name="coordinates", values=NODE2COORDS_COMPLEX_PART2) -SPLICE_GRAPH_2.add_edges_from(OVERLAPS_COMPLEX_DICT["ENSDART00000165342.1"].keys()) -nx.set_edge_attributes( - G=SPLICE_GRAPH_2, name="overlaps", values=OVERLAPS_COMPLEX_DICT["ENSDART00000165342.1"] -) - -SPLICE_GRAPH_COMPLEX_DICT = { - "ENSDART00000161035.1": SPLICE_GRAPH_1, - "ENSDART00000165342.1": SPLICE_GRAPH_2 -} - - - - -TRANSCRIPTOME_EMPTY_FN = "tests/build_splice_graph/transcriptome_empty.fa" -TRANSCRIPTOME_SIMPLE_FN = "tests/build_splice_graph/transcriptome_simple.fa" -TRANSCRIPTOME_COMPLEX_FN = "tests/build_splice_graph/transcriptome_complex.fa" - -TRANSCRIPTOME_EMPTY_DICT = fasta_to_dict(TRANSCRIPTOME_EMPTY_FN) -TRANSCRIPTOME_SIMPLE_DICT = fasta_to_dict(TRANSCRIPTOME_SIMPLE_FN) -TRANSCRIPTOME_COMPLEX_DICT = fasta_to_dict(TRANSCRIPTOME_COMPLEX_FN) - -SEGMENTS_EMPTY = [] -SEGMENTS_SIMPLE = [ - "S\t" - "ENSDART00000161035.1:0-326\t" - "TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCC" - "AATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCT" - "CTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTG" - "CGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA\t" - "LN:i:326\n" -] -SEGMENTS_COMPLEX = [ - "S\tENSDART00000161035.1:0-326\tTGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACAT" - "TTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCAT" - "GTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCA" - "TTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA\tLN:i:326\n", - "S\tENSDART00000161035.1:397-472\tAGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTC" - "AGTCCAAATCAACA\tLN:i:75\n", - "S\tENSDART00000161035.1:477-523\tAGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA\tLN:i:46\n", - "S\tENSDART00000165342.1:5-127\tTGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGG" - "AAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAAG\tLN:i:122\n", - "S\tENSDART00000165342.1:125-304\tAGGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGC" - "AAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAAT" - "GGCAAGGAGTTAGAGATGCCACCA\tLN:i:179\n", - "S\tENSDART00000165342.1:317-460\tGTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAG" - "GTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAG\tLN:i:143\n" - , - "S\tENSDART00000165342.1:459-592\tGTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCT" - "CTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAA\tLN:i:133\n", - "S\tENSDART00000165342.1:591-650\tAGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTGCCCT\t" - "LN:i:59\n", - "S\tENSDART00000165342.1:645-746\tGCCCTTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCA" - "TCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT\tLN:i:101\n", - "S\tENSDART00000165342.1:746-851\tGATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCC" - "TTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA\tLN:i:105\n", - "S\tENSDART00000165342.1:854-886\tTGCAGCCAAACAATGCAACTGTGACAGCAGCA\tLN:i:32\n", - "S\tENSDART00000165342.1:899-953\tTGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA\tLN:i:" - "54\n", - "S\tENSDART00000165342.1:974-1097\tCGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGA" - "GTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA\tLN:i:123\n", - "S\tENSDART00000165342.1:1098-1175\tTATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTT" - "GACGCTCACATATCCTGA\tLN:i:77\n", - "S\tENSDART00000165342.1:1176-1324\tCCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGA" - "CCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG\tLN:" - "i:148\n" -] - -LINKS_EMPTY = [] -LINKS_SIMPLE = [] -LINKS_COMPLEX = [ - "L\tENSDART00000161035.1:0-326\t+\tENSDART00000161035.1:397-472\t+\t71G\n", - "L\tENSDART00000161035.1:397-472\t+\tENSDART00000161035.1:477-523\t+\t5G\n", - "L\tENSDART00000165342.1:5-127\t+\tENSDART00000165342.1:125-304\t+\t2M\n", - "L\tENSDART00000165342.1:125-304\t+\tENSDART00000165342.1:317-460\t+\t13G\n", - "L\tENSDART00000165342.1:317-460\t+\tENSDART00000165342.1:459-592\t+\t1M\n", - "L\tENSDART00000165342.1:459-592\t+\tENSDART00000165342.1:591-650\t+\t1M\n", - "L\tENSDART00000165342.1:591-650\t+\tENSDART00000165342.1:645-746\t+\t5M\n", - "L\tENSDART00000165342.1:645-746\t+\tENSDART00000165342.1:746-851\t+\t0M\n", - "L\tENSDART00000165342.1:746-851\t+\tENSDART00000165342.1:854-886\t+\t3G\n", - "L\tENSDART00000165342.1:854-886\t+\tENSDART00000165342.1:899-953\t+\t13G\n", - "L\tENSDART00000165342.1:899-953\t+\tENSDART00000165342.1:974-1097\t+\t21G\n", - "L\tENSDART00000165342.1:974-1097\t+\tENSDART00000165342.1:1098-1175\t+\t1G\n", - "L\tENSDART00000165342.1:1098-1175\t+\tENSDART00000165342.1:1176-1324\t+\t1G\n" -] - -CONTAINMENTS_EMPTY = [] -CONTAINMENTS_SIMPLE = [ - "C\tENSDART00000161035.1\t+\tENSDART00000161035.1:0-326\t+\t0\t326M\n" -] -CONTAINMENTS_COMPLEX = [ - "C\tENSDART00000161035.1\t+\tENSDART00000161035.1:0-326\t+\t0\t326M\n", - "C\tENSDART00000161035.1\t+\tENSDART00000161035.1:397-472\t+\t397\t75M\n", - "C\tENSDART00000161035.1\t+\tENSDART00000161035.1:477-523\t+\t477\t46M\n", - "C\tENSDART00000165342.1\t+\tENSDART00000165342.1:5-127\t+\t5\t122M\n", - "C\tENSDART00000165342.1\t+\tENSDART00000165342.1:125-304\t+\t125\t179M\n", - "C\tENSDART00000165342.1\t+\tENSDART00000165342.1:317-460\t+\t317\t143M\n", - "C\tENSDART00000165342.1\t+\tENSDART00000165342.1:459-592\t+\t459\t133M\n", - "C\tENSDART00000165342.1\t+\tENSDART00000165342.1:591-650\t+\t591\t59M\n", - "C\tENSDART00000165342.1\t+\tENSDART00000165342.1:645-746\t+\t645\t101M\n", - "C\tENSDART00000165342.1\t+\tENSDART00000165342.1:746-851\t+\t746\t105M\n", - "C\tENSDART00000165342.1\t+\tENSDART00000165342.1:854-886\t+\t854\t32M\n", - "C\tENSDART00000165342.1\t+\tENSDART00000165342.1:899-953\t+\t899\t54M\n", - "C\tENSDART00000165342.1\t+\tENSDART00000165342.1:974-1097\t+\t974\t123M\n", - "C\tENSDART00000165342.1\t+\tENSDART00000165342.1:1098-1175\t+\t1098\t77M\n", - "C\tENSDART00000165342.1\t+\tENSDART00000165342.1:1176-1324\t+\t1176\t148M\n" -] - -PATHS_EMPTY = [] -PATHS_SIMPLE = [ - "P\tENSDART00000161035.1\tENSDART00000161035.1:0-326+\n" -] -PATHS_COMPLEX = [ - "P\tENSDART00000161035.1\tENSDART00000161035.1:0-326+,ENSDART00000161035.1:397-472+,ENSDART0000" - "0161035.1:477-523+\n", - "P\tENSDART00000165342.1\tENSDART00000165342.1:5-127+,ENSDART00000165342.1:125-304+,ENSDART0000" - "0165342.1:317-460+,ENSDART00000165342.1:459-592+,ENSDART00000165342.1:591-650+,ENSDART00000165" - "342.1:645-746+,ENSDART00000165342.1:746-851+,ENSDART00000165342.1:854-886+,ENSDART00000165342." - "1:899-953+,ENSDART00000165342.1:974-1097+,ENSDART00000165342.1:1098-1175+,ENSDART00000165342.1" - ":1176-1324+\n" -] - - - -SEGMENTS_EMPTY_DICT = fasta_to_dict("/dev/null") -SEGMENTS_SIMPLE_DICT = fasta_to_dict("tests/io/exons_simple.fa") -SEGMENTS_COMPLEX_DICT = fasta_to_dict("tests/io/exons_complex.fa") - -LINKS_EMPTY_DICT = OVERLAPS_EMPTY -LINKS_SIMPLE_DICT = OVERLAPS_SIMPLE -LINKS_COMPLEX_DICT = OVERLAPS_COMPLEX - -CONTAINMENTS_EMPTY_DICT = NODE2COORDS_EMPTY -CONTAINMENTS_SIMPLE_DICT = NODE2COORDS_SIMPLE -CONTAINMENTS_COMPLEX_DICT = NODE2COORDS_COMPLEX - -PATHS_EMPTY_DICT = PATH_EMPTY -PATHS_SIMPLE_DICT = PATH_SIMPLE -PATHS_COMPLEX_DICT = PATH_COMPLEX - -GFA_EMPTY_FN = "tests/io/empty.gfa" -GFA_SIMPLE_FN = "tests/io/simple.gfa" -GFA_COMPLEX_FN = "tests/io/complex.gfa" diff --git a/tests/test_find_exons.py b/tests/test_find_exons.py index d2df1ca..2e826de 100644 --- a/tests/test_find_exons.py +++ b/tests/test_find_exons.py @@ -7,9 +7,6 @@ import unittest -import numpy as np -import pandas as pd - from exfi.io.fasta_to_dict import \ fasta_to_dict @@ -22,27 +19,9 @@ from tests.custom_assertions import \ CustomAssertions -from tests.data import \ - BED3RECORDS_EMPTY, BED3RECORDS_SIMPLE, BED3RECORDS_COMPLEX, \ - BED3RECORDS_EMPTY_FN, BED3RECORDS_SIMPLE_FN, BED3RECORDS_COMPLEX_FN - -def create_bed_from_lists(lists): - """tests.find_exons_pipeline.create_bed_from_lists: convert list of lists - to a BED3 dataframe""" - bed3 = pd.DataFrame( - data=lists, - columns=["chrom", "chromStart", "chromEnd"] - ) - - bed3.chromStart.astype(np.int64) - bed3.chromEnd.astype(np.int64) - return bed3 - - - -BED3DF_EMPTY = create_bed_from_lists(BED3RECORDS_EMPTY) -BED3DF_SIMPLE = create_bed_from_lists(BED3RECORDS_SIMPLE) -BED3DF_COMPLEX = create_bed_from_lists(BED3RECORDS_COMPLEX) +from tests.io.bed import \ + BED3_EMPTY, BED3_SIMPLE, BED3_COMPLEX, \ + BED3_EMPTY_FN, BED3_SIMPLE_FN, BED3_COMPLEX_FN @@ -51,23 +30,23 @@ class TestProcessOutput(unittest.TestCase): def test_empty_process(self): """exfi.find_exons._command_to_list: process an empty stream""" - results = _command_to_list(["cat", BED3RECORDS_EMPTY_FN]) + results = _command_to_list(["cat", BED3_EMPTY_FN]) self.assertTrue(results.shape == (0, 3)) def test_simple_process(self): """exfi.find_exons._command_to_list: process an simple stream""" - results = _command_to_list(["cat", BED3RECORDS_SIMPLE_FN]) + results = _command_to_list(["cat", BED3_SIMPLE_FN]) print("Observed:\n", results) - print("Expected:\n", BED3DF_SIMPLE) - self.assertTrue(results.equals(BED3DF_SIMPLE)) + print("Expected:\n", BED3_SIMPLE) + self.assertTrue(results.equals(BED3_SIMPLE)) def test_big_process(self): """exfi.find_exons._command_to_list: process an big stream""" - results = _command_to_list(["cat", BED3RECORDS_COMPLEX_FN]) + results = _command_to_list(["cat", BED3_COMPLEX_FN]) print("Observed:\n", results, results.dtypes) - print("Expected:\n", BED3DF_COMPLEX, BED3DF_COMPLEX.dtypes) - self.assertTrue(results.equals(BED3DF_COMPLEX)) + print("Expected:\n", BED3_COMPLEX, BED3_COMPLEX.dtypes) + self.assertTrue(results.equals(BED3_COMPLEX)) @@ -136,6 +115,7 @@ def test_multi_seqs_multi_beds(self): ) + class TestFindExonsPipeline(unittest.TestCase): """Tests for find_exons_pipeline""" @@ -146,7 +126,7 @@ def test_notranscriptome_noreads(self): transcriptome_fn = "/dev/null" results = _bf_and_process(reads_fns, transcriptome_fn) print("Observed:\n", results) - print("Expected:\n", BED3DF_EMPTY) + print("Expected:\n", BED3_EMPTY) self.assertEqual(results.shape, (0, 3)) def test_transcriptome_noreads(self): @@ -169,5 +149,6 @@ def test_small_data(self): self.assertEqual(results.shape, (0, 3)) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_polish.py b/tests/test_polish.py index a2d795f..1415242 100644 --- a/tests/test_polish.py +++ b/tests/test_polish.py @@ -11,7 +11,7 @@ from exfi.polish import \ polish_bed4 -from tests.data import \ +from tests.io.transcriptome_dicts import \ TRANSCRIPTOME_EMPTY_DICT, \ TRANSCRIPTOME_SIMPLE_DICT, \ TRANSCRIPTOME_COMPLEX_DICT From 07be0a4e45f831bf42cb97ca025135caa7fa1346 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Wed, 16 Jan 2019 19:02:28 +0100 Subject: [PATCH 29/45] masking + tests --- exfi/io/masking.py | 142 +++++++++++++++--------------- tests/io/bed.py | 161 +++++++++++++++++++++++++++------- tests/test_io/test_masking.py | 126 +++----------------------- tests/test_polish.py | 61 +------------ 4 files changed, 220 insertions(+), 270 deletions(-) diff --git a/exfi/io/masking.py b/exfi/io/masking.py index 9f5ecf0..c78ea2c 100644 --- a/exfi/io/masking.py +++ b/exfi/io/masking.py @@ -6,91 +6,95 @@ import logging - -def _process_overlap_cigar(cigar_string: str) -> list: - """Process a simple CIGAR string. - - :param cigar_string: Process a simple CIGAR string of the shape number-letter: 90G, 10M, ... +import pandas as pd + +def soft_mask(sequence, left, right): + """Lowercase the first left bases and last right bases of sequence + + >>> soft_mask('ACCGATCGATCGTAG', 2, 1) + 'acCGATCGATCGTAg' + >>> soft_mask('ACCGATCGATCGTAG', 0, 2) + 'ACCGATCGATCGTag' + >>> soft_mask('ACCGATCGATCGTAG', 2, 0) + 'acCGATCGATCGTAG' + >>> soft_mask('ACCGATCGATCGTAG', 0, 0) + 'ACCGATCGATCGTAG' """ - return [cigar_string[-1], int(cigar_string[:-1])] - - -def _soft_mask_right(string: str, n_bases: int) -> str: - """Soft mask the rightmost n bases. - - :param str string: string of nucleotides to be soft masked. - :param int n_bases: number of letters at the right end (3') to be soft masked. - """ - return string[:-n_bases] + string[-n_bases:].lower() - - -def _soft_mask_left(string: str, n_bases: int) -> str: - """Soft mask the leftmost n bases. - - :param str string: string of nucleotides to be soft masked - :param int n_bases: number of letters at the left end (5') to be soft masked. + if left == 0 and right == 0: + return sequence + if left == 0 and right > 0: + return sequence[:-right] + sequence[-right:].lower() + if left > 0 and right == 0: + return sequence[:left].lower() + sequence[left:] + return sequence[:left].lower() + sequence[left:-right] + sequence[-right:].lower() + + + +def hard_mask(sequence, left, right): + """Mask with N the first left bases and last right bases of sequence + + >>> hard_mask('ACCGATCGATCGTAG', 2, 1) + 'NNCGATCGATCGTAN' + >>> hard_mask('ACCGATCGATCGTAG', 0, 2) + 'ACCGATCGATCGTNN' + >>> hard_mask('ACCGATCGATCGTAG', 2, 0) + 'NNCGATCGATCGTAG' + >>> hard_mask('ACCGATCGATCGTAG', 0, 0) + 'ACCGATCGATCGTAG' """ - return string[:n_bases].lower() + string[n_bases:] + if left == 0 and right == 0: + return sequence + if left == 0 and right > 0: + return sequence[:-right] + 'N' * right + if left > 0 and right == 0: + return 'N' * left + sequence[left:] + return 'N' * left + sequence[left:-right] + 'N' * right -def _soft_mask(exon_dict, overlap_dict): - """Soft mask all overlaps in the exon_dict. - :param dict exon_dict: dict of exon_id: sequence - :param dict overlap_dict: dict of (node1, node2): overlap - """ - exon_dict = exon_dict.copy() - for (start, end), overlap in overlap_dict.items(): - if overlap > 0: - exon_dict[start] = _soft_mask_right(exon_dict[start], overlap) - exon_dict[end] = _soft_mask_left(exon_dict[end], overlap) - return exon_dict - - -def _hard_mask_right(string: str, n_bases: int) -> str: - """Hard mask the rightmost n_bases bases +def mask(node2sequence, edge2overlap, masking: str = "none"): + """If any of the soft mask or hard mask are activated, mask - :param string: Nucleotide sequence to hard mask. - :param n_bases: Number of bases to hard mask at the right (3') end. + :param dict exon_dict: Dict of the shape exon_id: sequence. + :param dict overlap_dict: Dict of the shape (exon1, exon2): overlap between them. + :param str masking: Type of masking to apply. Options: hard, soft, none + (Default value = "None") . """ - return string[:-n_bases] + "N" * n_bases + if masking == 'none': + return node2sequence + edge2overlap['tmp_overlap'] = edge2overlap.overlap.map( + lambda x: x if x > 0 else 0 + ) -def _hard_mask_left(string: str, n_bases: int): - """Hard mask the leftmost n_bases bases - - :param str string: Nucleotide sequence to hard mask. - :param int n_bases: Number of bases to hard mask at the left (5') end. - """ - return "N" * n_bases + string[n_bases:] + tmp = pd.merge( + node2sequence, + edge2overlap[['u', 'tmp_overlap']].rename( + columns={'u': 'name', 'tmp_overlap': 'mask_right'} + ), + on=['name'] + ) + complete = complete = pd.merge( + tmp, + edge2overlap[['v', 'tmp_overlap']].rename( + columns={'v': 'name', 'tmp_overlap': 'mask_left'} + ), + on=['name'] + ) -def _hard_mask(exon_dict, overlap_dict): - """Hard mask all overlaps in the exon_dict. - - :param dict exon_dict: Dict of the shape exon_id: sequence. - :param dict overlap_dict: Dict of the shape (exon1, exon2): overlap between them. - """ - exon_dict = exon_dict.copy() - for (start, end), overlap in overlap_dict.items(): - if overlap > 0: - exon_dict[start] = _hard_mask_right(exon_dict[start], overlap) - exon_dict[end] = _hard_mask_left(exon_dict[end], overlap) - return exon_dict + complete['tmp'] = tuple(zip( + complete.sequence, complete.mask_left, complete.mask_right + )) -def _mask(exon_dict, overlap_dict, masking: str = "none"): - """If any of the soft mask or hard mask are activated, mask - :param dict exon_dict: Dict of the shape exon_id: sequence. - :param dict overlap_dict: Dict of the shape (exon1, exon2): overlap between them. - :param str masking: Type of masking to apply. Options: hard, soft, none (Default value = "None") - . - """ if masking == "hard": logging.info("\tHard masking sequences") - exon_dict = _hard_mask(exon_dict, overlap_dict) + complete['sequence'] = complete.tmp.map(lambda x: hard_mask(*x)) elif masking == "soft": logging.info("\tSoft masking sequences") - exon_dict = _soft_mask(exon_dict, overlap_dict) + complete['sequence'] = complete.tmp.map(lambda x: soft_mask(*x)) + + exon_dict = complete[['name', 'sequence']].reset_index(drop=True) return exon_dict diff --git a/tests/io/bed.py b/tests/io/bed.py index 0cdeb56..4c91da2 100644 --- a/tests/io/bed.py +++ b/tests/io/bed.py @@ -5,22 +5,26 @@ import pandas as pd import numpy as np +BED3_COLUMNS = ['chrom', 'chromStart', 'chromEnd'] +BED3_DTYPES = {'chrom': np.str, 'chromStart': np.int64, 'chromEnd': np.int64} + +BED4_COLUMNS = ['chrom', 'chromStart', 'chromEnd', 'name'] +BED4_DTYPES = { + 'chrom': np.str, 'chromStart': np.int64, 'chromEnd': np.int64, 'name':np.str +} + + BED3_EMPTY_FN = "tests/io/empty.bed" BED3_SIMPLE_FN = "tests/io/simple.bed" BED3_COMPLEX_FN = "tests/io/complex.bed" -BED3_EMPTY = pd.DataFrame( - data=None, - columns=["chrom", "chromStart", "chromEnd"] -) -BED3_EMPTY = BED3_EMPTY.astype( - {"chrom": str, "chromStart": np.int64, "chromEnd": np.int64} -) +BED3_EMPTY = pd.DataFrame(columns=BED3_COLUMNS) +BED3_EMPTY = BED3_EMPTY.astype(BED3_DTYPES) BED3_SIMPLE = pd.DataFrame( data=[("ENSDART00000161035.1", 0, 326)], - columns=["chrom", "chromStart", "chromEnd"] + columns=BED3_COLUMNS ) BED3_COMPLEX = pd.DataFrame( @@ -41,23 +45,18 @@ ["ENSDART00000165342.1", 1098, 1175], ["ENSDART00000165342.1", 1176, 1324] ], - columns=["chrom", "chromStart", "chromEnd"] + columns=BED3_COLUMNS ) -BED4_EMPTY = pd.DataFrame( - data=None, - columns=["chrom", "chromStart", "chromEnd", "name"] -) -BED4_EMPTY = BED4_EMPTY.astype( - {"chrom": str, "chromStart": np.int64, "chromEnd": np.int64, "name": str} -) +BED4_EMPTY = pd.DataFrame(columns=BED4_COLUMNS) +BED4_EMPTY = BED4_EMPTY.astype(BED4_DTYPES) BED4_SIMPLE = pd.DataFrame( data=[("ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326")], - columns=["chrom", "chromStart", "chromEnd", "name"] + columns=BED4_COLUMNS ) BED4_COMPLEX = pd.DataFrame( @@ -78,7 +77,7 @@ ["ENSDART00000165342.1", 1098, 1175, "ENSDART00000165342.1:1098-1175"], ["ENSDART00000165342.1", 1176, 1324, "ENSDART00000165342.1:1176-1324"] ], - columns=["chrom", "chromStart", "chromEnd", "name"] + columns=BED4_COLUMNS ) @@ -86,16 +85,14 @@ NODE2COORDINATES_EMPTY = pd.DataFrame( data=None, - columns=["chrom", "chromStart", "chromEnd", "name"] -).set_index("name") -NODE2COORDINATES_EMPTY = NODE2COORDINATES_EMPTY.astype( - {"chromStart": np.int64, "chromEnd": np.int64} -) - + columns=BED4_COLUMNS +)\ +.astype(BED4_DTYPES)\ +.set_index("name") NODE2COORDINATES_SIMPLE = pd.DataFrame( data=[("ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326")], - columns=["chrom", "chromStart", "chromEnd", "name"] + columns=BED4_COLUMNS ).set_index("name") NODE2COORDINATES_COMPLEX = pd.DataFrame( @@ -116,7 +113,7 @@ ["ENSDART00000165342.1", 1098, 1175, "ENSDART00000165342.1:1098-1175"], ["ENSDART00000165342.1", 1176, 1324, "ENSDART00000165342.1:1176-1324"] ], - columns=["chrom", "chromStart", "chromEnd", "name"] + columns=BED4_COLUMNS ).set_index("name") @@ -174,7 +171,7 @@ ], [ "ENSDART00000161035.1:477-523", "AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA" - ], [ + ], [ "ENSDART00000165342.1:5-127", "TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAA" "TCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAAG" @@ -309,9 +306,9 @@ ['ENSDART00000167898', 424, 488], ['ENSDART00000167898', 488, 605] ], - columns=["chrom", "chromStart", "chromEnd"] + columns=BED3_COLUMNS ) -BED3_NCBI = pd.DataFrame(columns=["chrom", "chromStart", "chromEnd"]) +# BED3_NCBI = pd.DataFrame(columns=["chrom", "chromStart", "chromEnd"]) BED3_GMAP = pd.DataFrame( data=[ ['ENSDART00000171570', 0, 61], @@ -391,5 +388,109 @@ ['ENSDART00000172182', 61, 413], ['ENSDART00000172374', 0, 355] ], - columns=["chrom", "chromStart", "chromEnd"] + columns=BED3_COLUMNS +) + + + +BED4_SIMPLE_POLISHED = BED4_SIMPLE + +BED4_COMPLEX_POLISHED = pd.DataFrame( + data=[ + ["ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326"], + ["ENSDART00000161035.1", 397, 472, "ENSDART00000161035.1:397-472"], + ["ENSDART00000161035.1", 477, 523, "ENSDART00000161035.1:477-523"], + ["ENSDART00000165342.1", 5, 127, "ENSDART00000165342.1:5-127"], + ["ENSDART00000165342.1", 125, 304, "ENSDART00000165342.1:125-304"], + ["ENSDART00000165342.1", 317, 460, "ENSDART00000165342.1:317-460"], + ["ENSDART00000165342.1", 459, 592, "ENSDART00000165342.1:459-592"], + ["ENSDART00000165342.1", 591, 650, "ENSDART00000165342.1:591-650"], + ["ENSDART00000165342.1", 645, 746, "ENSDART00000165342.1:645-746"], + ["ENSDART00000165342.1", 746, 851, "ENSDART00000165342.1:746-851"], + ["ENSDART00000165342.1", 854, 886, "ENSDART00000165342.1:854-886"], + ["ENSDART00000165342.1", 899, 953, "ENSDART00000165342.1:899-953"], + ["ENSDART00000165342.1", 974, 1097, "ENSDART00000165342.1:974-1097"], + ["ENSDART00000165342.1", 1098, 1175, "ENSDART00000165342.1:1098-1175"], + ["ENSDART00000165342.1", 1176, 1324, "ENSDART00000165342.1:1176-1324"] + ], + columns=["chrom", "chromStart", "chromEnd", "name"] +) + + + +NODE2SEQUENCE_COMPLEX_SOFT = pd.DataFrame( + data=[[ + 'ENSDART00000161035.1:397-472', + 'AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACA' + ], [ + 'ENSDART00000165342.1:125-304', + 'agGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA' + ], [ + 'ENSDART00000165342.1:317-460', + 'GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAg' + ], [ + 'ENSDART00000165342.1:459-592', + 'gTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAa' + ], [ + 'ENSDART00000165342.1:591-650', + 'aGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTgccct' + ], [ + 'ENSDART00000165342.1:645-746', + 'gccctTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT' + ], [ + 'ENSDART00000165342.1:746-851', + 'GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA' + ], [ + 'ENSDART00000165342.1:854-886', + 'TGCAGCCAAACAATGCAACTGTGACAGCAGCA' + ], [ + 'ENSDART00000165342.1:899-953', + 'TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA' + ], [ + 'ENSDART00000165342.1:974-1097', + 'CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA' + ], [ + 'ENSDART00000165342.1:1098-1175', + 'TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGA' + ]], + columns=['name', 'sequence'] +) + +NODE2SEQUENCE_COMPLEX_HARD = pd.DataFrame( + data=[[ + 'ENSDART00000161035.1:397-472', + 'AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACA' + ], [ + 'ENSDART00000165342.1:125-304', + 'NNGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA' + ], [ + 'ENSDART00000165342.1:317-460', + 'GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAN' + ], [ + 'ENSDART00000165342.1:459-592', + 'NTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAN' + ], [ + 'ENSDART00000165342.1:591-650', + 'NGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTNNNNN' + ], [ + 'ENSDART00000165342.1:645-746', + 'NNNNNTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT' + ], [ + 'ENSDART00000165342.1:746-851', + 'GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA' + ], [ + 'ENSDART00000165342.1:854-886', + 'TGCAGCCAAACAATGCAACTGTGACAGCAGCA' + ], [ + 'ENSDART00000165342.1:899-953', + 'TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA' + ], [ + 'ENSDART00000165342.1:974-1097', + 'CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA' + ], [ + 'ENSDART00000165342.1:1098-1175', + 'TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGA' + ]], + columns=["name", "sequence"] + ) diff --git a/tests/test_io/test_masking.py b/tests/test_io/test_masking.py index 066fd24..5dd7c30 100644 --- a/tests/test_io/test_masking.py +++ b/tests/test_io/test_masking.py @@ -7,133 +7,33 @@ from unittest import TestCase, main from exfi.io.masking import \ - _process_overlap_cigar, \ - _soft_mask_right, \ - _soft_mask_left, \ - _soft_mask, \ - _hard_mask_right, \ - _hard_mask_left, \ - _hard_mask, \ - _mask + mask -from tests.data import \ - OVERLAPS_COMPLEX +from tests.io.bed import \ + EDGE2OVERLAP_COMPLEX, NODE2SEQUENCE_COMPLEX, NODE2SEQUENCE_COMPLEX_HARD, \ + NODE2SEQUENCE_COMPLEX_SOFT -# pylint: disable=no-name-in-module -from tests.test_io.test_gfa1_to_exons import \ - EXONS_COMPLEX_DICT, EXONS_COMPLEX_SOFT_DICT, EXONS_COMPLEX_HARD_DICT -from tests.custom_assertions import CustomAssertions - -class TestProcessOverlapCigar(TestCase): - """Tests for exfi.io.masking._process_overlap_cigar""" - - def test_empty(self): - """exfi.io.masking._process_overlap_cigar: empty case""" - with self.assertRaises(IndexError): - _process_overlap_cigar("") - - def test_wrong(self): - """exfi.io.masking._process_overlap_cigar: wrong case""" - with self.assertRaises(IndexError): - _process_overlap_cigar("") - - def test_correct(self): - """exfi.io.masking._process_overlap_cigar: correct case""" - self.assertEqual( - _process_overlap_cigar("13M"), - ["M", 13] - ) - - - -class TestSoftMaskRight(TestCase): - """Tests for exfi.io.masking._soft_mask_right""" - - def test_soft_mask_right(self): - """exfi.io.masking._soft_mask_right: simple""" - actual = _soft_mask_right("AAAAA", 3) - expected = "AAaaa" - self.assertEqual(actual, expected) - - - -class TestSoftMaskLeft(TestCase): - """Tests for exfi.io.masking._soft_mask_left""" - - def test_soft_mask_left(self): - """exfi.io.masking._soft_mask_left: simple""" - actual = _soft_mask_left("AAAAA", 3) - expected = "aaaAA" - self.assertEqual(actual, expected) - - - -class TestSoftMask(TestCase, CustomAssertions): - """Tests for exfi.io.masking._soft_mask""" - - def test_soft_mask(self): - """exfi.io.masking._soft_mask: simple""" - actual = _soft_mask(EXONS_COMPLEX_DICT, OVERLAPS_COMPLEX) - expected = EXONS_COMPLEX_SOFT_DICT - self.assertEqualDict(actual, expected) - - - -class TestHardMaskRight(TestCase): - """Tests for exfi.io.masking._hard_mask_right""" - - def test_hard_mask_right(self): - """exfi.io.masking._hard_mask_right: simple""" - actual = _hard_mask_right("AAAAA", 3) - expected = "AANNN" - self.assertEqual(actual, expected) - - - -class TestHardMaskLeft(TestCase): - """Tests for exfi.io.masking._hard_mask_left""" - - def test_hard_mask_left(self): - """exfi.io.masking._hard_mask_left: simple""" - actual = _hard_mask_left("AAAAA", 3) - expected = "NNNAA" - self.assertEqual(actual, expected) - - - -class TestHardMask(TestCase, CustomAssertions): - """Tests for exfi.io.masking._hard_mask""" - - def test_hard_mask(self): - """exfi.io.masking._hard_mask: simple""" - actual = _hard_mask(EXONS_COMPLEX_DICT, OVERLAPS_COMPLEX) - expected = EXONS_COMPLEX_HARD_DICT - self.assertEqualDict(actual, expected) - - - -class TestMask(TestCase, CustomAssertions): +class TestMask(TestCase): """Tests for exfi.io.masking._mask""" def test_no_mask(self): """exfi.io.masking._mask: no masking""" - actual = _mask(EXONS_COMPLEX_DICT, OVERLAPS_COMPLEX, "none") - expected = EXONS_COMPLEX_DICT - self.assertEqualDict(actual, expected) + observed = mask(NODE2SEQUENCE_COMPLEX, EDGE2OVERLAP_COMPLEX, "none") + self.assertTrue(observed.equals(NODE2SEQUENCE_COMPLEX)) def test_soft_mask(self): """exfi.io.masking._mask: soft masking""" - actual = _mask(EXONS_COMPLEX_DICT, OVERLAPS_COMPLEX, "soft") - expected = EXONS_COMPLEX_SOFT_DICT - self.assertEqualDict(actual, expected) + observed = mask(NODE2SEQUENCE_COMPLEX, EDGE2OVERLAP_COMPLEX, "soft") + print(observed.values.tolist()) + self.assertTrue(observed.equals(NODE2SEQUENCE_COMPLEX_SOFT)) def test_hard_mask(self): """exfi.io.masking._mask: hard masking""" - actual = _mask(EXONS_COMPLEX_DICT, OVERLAPS_COMPLEX, "hard") - expected = EXONS_COMPLEX_HARD_DICT - self.assertEqualDict(actual, expected) + observed = mask(NODE2SEQUENCE_COMPLEX, EDGE2OVERLAP_COMPLEX, "hard") + print(observed.values.tolist()) + self.assertTrue(observed.equals(NODE2SEQUENCE_COMPLEX_HARD)) diff --git a/tests/test_polish.py b/tests/test_polish.py index 1415242..4a35b3d 100644 --- a/tests/test_polish.py +++ b/tests/test_polish.py @@ -6,8 +6,6 @@ TestCase, \ main -import pandas as pd - from exfi.polish import \ polish_bed4 @@ -16,62 +14,9 @@ TRANSCRIPTOME_SIMPLE_DICT, \ TRANSCRIPTOME_COMPLEX_DICT - -# Test data -BED4_EMPTY = pd.DataFrame( - data=None, columns=["chrom", "chromStart", "chromEnd", "name"] -) - -BED4_SIMPLE = pd.DataFrame( - data=[("ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326")], - columns=["chrom", "chromStart", "chromEnd", "name"] -) - -BED4_COMPLEX = pd.DataFrame( - data=[ - ["ENSDART00000161035.1", 397, 472, "ENSDART00000161035.1:397-472"], - ["ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326"], - ["ENSDART00000161035.1", 477, 523, "ENSDART00000161035.1:477-523"], - ["ENSDART00000165342.1", 1176, 1324, "ENSDART00000165342.1:1176-1324"], - ["ENSDART00000165342.1", 125, 304, "ENSDART00000165342.1:125-304"], - ["ENSDART00000165342.1", 746, 851, "ENSDART00000165342.1:746-851"], - ["ENSDART00000165342.1", 974, 1097, "ENSDART00000165342.1:974-1097"], - ["ENSDART00000165342.1", 854, 886, "ENSDART00000165342.1:854-886"], - ["ENSDART00000165342.1", 1098, 1175, "ENSDART00000165342.1:1098-1175"], - ["ENSDART00000165342.1", 5, 127, "ENSDART00000165342.1:5-127"], - ["ENSDART00000165342.1", 645, 746, "ENSDART00000165342.1:645-746"], - ["ENSDART00000165342.1", 317, 460, "ENSDART00000165342.1:317-460"], - ["ENSDART00000165342.1", 591, 650, "ENSDART00000165342.1:591-650"], - ["ENSDART00000165342.1", 459, 592, "ENSDART00000165342.1:459-592"], - ["ENSDART00000165342.1", 899, 953, "ENSDART00000165342.1:899-953"] - ], - columns=["chrom", "chromStart", "chromEnd", "name"] -) - - -BED4_SIMPLE_POLISHED = BED4_SIMPLE -BED4_COMPLEX_POLISHED = pd.DataFrame( - data=[ - ["ENSDART00000161035.1", 397, 472, "ENSDART00000161035.1:397-472"], - ["ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326"], - ["ENSDART00000161035.1", 477, 523, "ENSDART00000161035.1:477-523"], - ["ENSDART00000165342.1", 1176, 1324, "ENSDART00000165342.1:1176-1324"], - ["ENSDART00000165342.1", 125, 304, "ENSDART00000165342.1:125-304"], - ["ENSDART00000165342.1", 746, 851, "ENSDART00000165342.1:746-851"], - ["ENSDART00000165342.1", 974, 1097, "ENSDART00000165342.1:974-1097"], - ["ENSDART00000165342.1", 854, 886, "ENSDART00000165342.1:854-886"], - ["ENSDART00000165342.1", 1098, 1175, "ENSDART00000165342.1:1098-1175"], - ["ENSDART00000165342.1", 5, 127, "ENSDART00000165342.1:5-127"], - ["ENSDART00000165342.1", 645, 746, "ENSDART00000165342.1:645-746"], - ["ENSDART00000165342.1", 317, 460, "ENSDART00000165342.1:317-460"], - ["ENSDART00000165342.1", 591, 650, "ENSDART00000165342.1:591-650"], - ["ENSDART00000165342.1", 459, 592, "ENSDART00000165342.1:459-592"], - ["ENSDART00000165342.1", 899, 953, "ENSDART00000165342.1:899-953"] - ], - columns=["chrom", "chromStart", "chromEnd", "name"] -) - - +from tests.io.bed import \ + BED4_EMPTY, BED4_SIMPLE, BED4_COMPLEX, \ + BED4_SIMPLE_POLISHED, BED4_COMPLEX_POLISHED From bad901fab357e1f9fc54dc5db2d13af389c6c59e Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Thu, 17 Jan 2019 10:38:40 +0100 Subject: [PATCH 30/45] Codacy: removed duplicated code and splitted long lines --- tests/io/bed.py | 85 +++++++++++++++++++++++-------------------------- 1 file changed, 39 insertions(+), 46 deletions(-) diff --git a/tests/io/bed.py b/tests/io/bed.py index 4c91da2..5433702 100644 --- a/tests/io/bed.py +++ b/tests/io/bed.py @@ -83,38 +83,11 @@ -NODE2COORDINATES_EMPTY = pd.DataFrame( - data=None, - columns=BED4_COLUMNS -)\ -.astype(BED4_DTYPES)\ -.set_index("name") +NODE2COORDINATES_EMPTY = BED4_EMPTY.copy().set_index("name") -NODE2COORDINATES_SIMPLE = pd.DataFrame( - data=[("ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326")], - columns=BED4_COLUMNS -).set_index("name") +NODE2COORDINATES_SIMPLE = BED4_SIMPLE.copy().set_index("name") -NODE2COORDINATES_COMPLEX = pd.DataFrame( - data=[ - ["ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326"], - ["ENSDART00000161035.1", 397, 472, "ENSDART00000161035.1:397-472"], - ["ENSDART00000161035.1", 477, 523, "ENSDART00000161035.1:477-523"], - ["ENSDART00000165342.1", 5, 127, "ENSDART00000165342.1:5-127"], - ["ENSDART00000165342.1", 125, 304, "ENSDART00000165342.1:125-304"], - ["ENSDART00000165342.1", 317, 460, "ENSDART00000165342.1:317-460"], - ["ENSDART00000165342.1", 459, 592, "ENSDART00000165342.1:459-592"], - ["ENSDART00000165342.1", 591, 650, "ENSDART00000165342.1:591-650"], - ["ENSDART00000165342.1", 645, 746, "ENSDART00000165342.1:645-746"], - ["ENSDART00000165342.1", 746, 851, "ENSDART00000165342.1:746-851"], - ["ENSDART00000165342.1", 854, 886, "ENSDART00000165342.1:854-886"], - ["ENSDART00000165342.1", 899, 953, "ENSDART00000165342.1:899-953"], - ["ENSDART00000165342.1", 974, 1097, "ENSDART00000165342.1:974-1097"], - ["ENSDART00000165342.1", 1098, 1175, "ENSDART00000165342.1:1098-1175"], - ["ENSDART00000165342.1", 1176, 1324, "ENSDART00000165342.1:1176-1324"] - ], - columns=BED4_COLUMNS -).set_index("name") +NODE2COORDINATES_COMPLEX = BED4_COMPLEX.copy().set_index("name") @@ -421,25 +394,33 @@ NODE2SEQUENCE_COMPLEX_SOFT = pd.DataFrame( data=[[ 'ENSDART00000161035.1:397-472', - 'AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACA' + 'AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAAT' + 'CAACA' ], [ 'ENSDART00000165342.1:125-304', - 'agGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA' + 'agGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACA' + 'ATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCAC' + 'AGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA' ], [ 'ENSDART00000165342.1:317-460', - 'GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAg' + 'GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAG' + 'GTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTAC' + 'CAg' ], [ 'ENSDART00000165342.1:459-592', - 'gTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAa' + 'gTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGG' + 'CTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAa' ], [ 'ENSDART00000165342.1:591-650', 'aGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTgccct' ], [ 'ENSDART00000165342.1:645-746', - 'gccctTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT' + 'gccctTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAG' + 'AGTCTGCTGGAGGAATCAGTGTATCCACGCT' ], [ 'ENSDART00000165342.1:746-851', - 'GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA' + 'GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGAT' + 'GGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA' ], [ 'ENSDART00000165342.1:854-886', 'TGCAGCCAAACAATGCAACTGTGACAGCAGCA' @@ -448,10 +429,12 @@ 'TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA' ], [ 'ENSDART00000165342.1:974-1097', - 'CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA' + 'CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAA' + 'GTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA' ], [ 'ENSDART00000165342.1:1098-1175', - 'TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGA' + 'TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACAT' + 'ATCCTGA' ]], columns=['name', 'sequence'] ) @@ -459,25 +442,33 @@ NODE2SEQUENCE_COMPLEX_HARD = pd.DataFrame( data=[[ 'ENSDART00000161035.1:397-472', - 'AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACA' + 'AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAAT' + 'CAACA' ], [ 'ENSDART00000165342.1:125-304', - 'NNGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA' + 'NNGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACA' + 'ATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCAC' + 'AGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA' ], [ 'ENSDART00000165342.1:317-460', - 'GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAN' + 'GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAG' + 'GTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTAC' + 'CAN' ], [ 'ENSDART00000165342.1:459-592', - 'NTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAN' + 'NTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGG' + 'CTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAN' ], [ 'ENSDART00000165342.1:591-650', 'NGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTNNNNN' ], [ 'ENSDART00000165342.1:645-746', - 'NNNNNTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT' + 'NNNNNTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAG' + 'AGTCTGCTGGAGGAATCAGTGTATCCACGCT' ], [ 'ENSDART00000165342.1:746-851', - 'GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA' + 'GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGAT' + 'GGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA' ], [ 'ENSDART00000165342.1:854-886', 'TGCAGCCAAACAATGCAACTGTGACAGCAGCA' @@ -486,10 +477,12 @@ 'TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA' ], [ 'ENSDART00000165342.1:974-1097', - 'CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA' + 'CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAA' + 'GTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA' ], [ 'ENSDART00000165342.1:1098-1175', - 'TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGA' + 'TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACAT' + 'ATCCTGA' ]], columns=["name", "sequence"] From 0289afbd5d4aebbd887fce1c454d821863db0686 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Thu, 17 Jan 2019 11:33:18 +0100 Subject: [PATCH 31/45] Bugfix: correct didn't remove tmp files --- exfi/correct.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/exfi/correct.py b/exfi/correct.py index a20e3da..a391627 100755 --- a/exfi/correct.py +++ b/exfi/correct.py @@ -154,7 +154,9 @@ def correct_bed4(bed4, transcriptome_dict, args): sealer_input_fn = prepare_sealer( bed4=bed4, transcriptome_dict=transcriptome_dict, args=args ) - output_sealer_fn = run_sealer(sealer_input_fn=sealer_input_fn, args=args) - sealer_results = collect_sealer_results(filename=output_sealer_fn) + sealer_output_fn = run_sealer(sealer_input_fn=sealer_input_fn, args=args) + sealer_results = collect_sealer_results(filename=sealer_output_fn) bed4_corrected = apply_correction_to_bed4(bed4, sealer_results) + os.remove(sealer_input_fn) + os.remove(sealer_output_fn) return bed4_corrected From ceacc4fea47cb7d96385059745e827122f6f772b Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Wed, 23 Jan 2019 17:43:16 +0100 Subject: [PATCH 32/45] Disabled pandas' SettingWithCopyWarning --- exfi/correct.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/exfi/correct.py b/exfi/correct.py index a391627..511ed0a 100755 --- a/exfi/correct.py +++ b/exfi/correct.py @@ -32,12 +32,16 @@ def prepare_sealer(bed4, transcriptome_dict, args): edge2overlap = bed4_to_edge2overlap(bed4) node2sequence_dict = node2sequence.set_index("name").to_dict()["sequence"] + # Disable warnings + pd.options.mode.chained_assignment = None + + # Compute the small gaps small_gaps = edge2overlap\ .loc[(edge2overlap.overlap < 0) & (edge2overlap.overlap <= max_gap_size)] - small_gaps["data_to_map"] = tuple(zip(small_gaps.u, small_gaps.v)) + small_gaps["identifier"] = small_gaps['u'] + "~" + small_gaps['v'] - small_gaps["identifier"] = small_gaps.u + "~" + small_gaps.v + small_gaps["data_to_map"] = tuple(zip(small_gaps.u, small_gaps.v)) small_gaps["sequence"] = small_gaps.data_to_map\ .map( @@ -49,6 +53,7 @@ def prepare_sealer(bed4, transcriptome_dict, args): small_gaps = small_gaps[["identifier", "sequence"]] + # Compute pairs of overlapping exons overlaps = edge2overlap.loc[edge2overlap.overlap >= 0] overlaps["data_to_map"] = tuple(zip(overlaps.u, overlaps.v, overlaps.overlap)) overlaps["identifier"] = overlaps.u + "~" + overlaps.v @@ -61,6 +66,10 @@ def prepare_sealer(bed4, transcriptome_dict, args): ) overlaps = overlaps[["identifier", "sequence"]] + # Put again the warning + pd.options.mode.chained_assignment = 'warn' + + # Merge the results for_sealer = pd.concat([small_gaps, overlaps]) for_sealer["fasta"] = ">" + for_sealer["identifier"] + "\n" + for_sealer["sequence"] + "\n" for_sealer = for_sealer[["fasta"]] From 33962a5709c75ee553ad89f47aa508e45259511b Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Thu, 24 Jan 2019 15:26:07 +0100 Subject: [PATCH 33/45] Bugfix when masking --- exfi/io/masking.py | 71 +++++++---- tests/io/bed.py | 223 ++++++++++++++++++++-------------- tests/test_io/test_masking.py | 17 ++- 3 files changed, 193 insertions(+), 118 deletions(-) diff --git a/exfi/io/masking.py b/exfi/io/masking.py index c78ea2c..4e0469d 100644 --- a/exfi/io/masking.py +++ b/exfi/io/masking.py @@ -6,7 +6,20 @@ import logging -import pandas as pd +import numpy as np + +def cigar_to_int(cigar): + """Convert a simple CIGAR string to overlap int + + >>> cigar_to_int('71N') + -71 + >>> cigar_to_int('3M') + 3 + """ + if cigar[-1] == 'N': + return -int(cigar[:-1]) + return int(cigar[:-1]) + def soft_mask(sequence, left, right): """Lowercase the first left bases and last right bases of sequence @@ -26,7 +39,7 @@ def soft_mask(sequence, left, right): return sequence[:-right] + sequence[-right:].lower() if left > 0 and right == 0: return sequence[:left].lower() + sequence[left:] - return sequence[:left].lower() + sequence[left:-right] + sequence[-right:].lower() + return sequence[:left].lower() + sequence[left:-right] + sequence[-right:].lower() @@ -63,32 +76,35 @@ def mask(node2sequence, edge2overlap, masking: str = "none"): if masking == 'none': return node2sequence - edge2overlap['tmp_overlap'] = edge2overlap.overlap.map( - lambda x: x if x > 0 else 0 - ) - - tmp = pd.merge( - node2sequence, - edge2overlap[['u', 'tmp_overlap']].rename( - columns={'u': 'name', 'tmp_overlap': 'mask_right'} - ), - on=['name'] - ) - - complete = complete = pd.merge( - tmp, - edge2overlap[['v', 'tmp_overlap']].rename( - columns={'v': 'name', 'tmp_overlap': 'mask_left'} - ), - on=['name'] - ) + # Compose a dataframe of name, sequence, bases to trim to the left + # and bases to trim to the right + + complete = node2sequence.merge( + edge2overlap[['u', 'overlap']]\ + .rename(columns={'u': 'name', 'overlap': 'mask_right'}), + on=['name'], + how='outer' + ).merge( + edge2overlap[['v', 'overlap']]\ + .rename(columns={'v': 'name', 'overlap': 'mask_left'}), + on=['name'], + how='outer' + )\ + .fillna(0)\ + .astype({'mask_right': np.int64, 'mask_left':np.int64}) + + # Set to zero overlaps < 0 + complete['mask_right'] = complete.mask_right\ + .map(lambda x: x if x > 0 else 0) + complete['mask_left'] = complete.mask_left\ + .map(lambda x: x if x > 0 else 0) complete['tmp'] = tuple(zip( - complete.sequence, complete.mask_left, complete.mask_right + complete.sequence, + complete.mask_left, + complete.mask_right )) - - if masking == "hard": logging.info("\tHard masking sequences") complete['sequence'] = complete.tmp.map(lambda x: hard_mask(*x)) @@ -96,5 +112,8 @@ def mask(node2sequence, edge2overlap, masking: str = "none"): logging.info("\tSoft masking sequences") complete['sequence'] = complete.tmp.map(lambda x: soft_mask(*x)) - exon_dict = complete[['name', 'sequence']].reset_index(drop=True) - return exon_dict + node2sequence_masked = complete\ + [['name', 'sequence']]\ + .reset_index(drop=True) + + return node2sequence_masked diff --git a/tests/io/bed.py b/tests/io/bed.py index 5433702..f1538d0 100644 --- a/tests/io/bed.py +++ b/tests/io/bed.py @@ -392,98 +392,141 @@ NODE2SEQUENCE_COMPLEX_SOFT = pd.DataFrame( - data=[[ - 'ENSDART00000161035.1:397-472', - 'AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAAT' - 'CAACA' - ], [ - 'ENSDART00000165342.1:125-304', - 'agGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACA' - 'ATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCAC' - 'AGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA' - ], [ - 'ENSDART00000165342.1:317-460', - 'GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAG' - 'GTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTAC' - 'CAg' - ], [ - 'ENSDART00000165342.1:459-592', - 'gTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGG' - 'CTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAa' - ], [ - 'ENSDART00000165342.1:591-650', - 'aGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTgccct' - ], [ - 'ENSDART00000165342.1:645-746', - 'gccctTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAG' - 'AGTCTGCTGGAGGAATCAGTGTATCCACGCT' - ], [ - 'ENSDART00000165342.1:746-851', - 'GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGAT' - 'GGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA' - ], [ - 'ENSDART00000165342.1:854-886', - 'TGCAGCCAAACAATGCAACTGTGACAGCAGCA' - ], [ - 'ENSDART00000165342.1:899-953', - 'TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA' - ], [ - 'ENSDART00000165342.1:974-1097', - 'CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAA' - 'GTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA' - ], [ - 'ENSDART00000165342.1:1098-1175', - 'TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACAT' - 'ATCCTGA' - ]], - columns=['name', 'sequence'] + data=[ + [ + "ENSDART00000161035.1:0-326", + "TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTT" + "GATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAG" + "GGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCT" + "CTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCA" + "GCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA" + ], [ + "ENSDART00000161035.1:397-472", + "AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCC" + "AAATCAACA" + ], [ + "ENSDART00000161035.1:477-523", + "AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA" + ], [ + "ENSDART00000165342.1:5-127", + "TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAA" + "TCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAag" + ], [ + "ENSDART00000165342.1:125-304", + "agGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGA" + "CACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGC" + "TCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA" + ], [ + "ENSDART00000165342.1:317-460", + "GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCC" + "TGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAA" + "GAAGTTACCAg" + ], [ + "ENSDART00000165342.1:459-592", + "gTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTT" + "CTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTA" + "a" + ], [ + "ENSDART00000165342.1:591-650", + "aGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTgccct" + ], [ + "ENSDART00000165342.1:645-746", + "gccctTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTT" + "GGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT" + ], [ + "ENSDART00000165342.1:746-851", + "GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTG" + "GGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA" + ], [ + "ENSDART00000165342.1:854-886", + "TGCAGCCAAACAATGCAACTGTGACAGCAGCA", + ], [ + "ENSDART00000165342.1:899-953", + "TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA" + ], [ + "ENSDART00000165342.1:974-1097", + "CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAG" + "TAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA" + ], [ + "ENSDART00000165342.1:1098-1175", + "TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTC" + "ACATATCCTGA" + ], [ + "ENSDART00000165342.1:1176-1324", + "CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATT" + "GAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTG" + "GCAAAACACCACAGCG" + ] + ], + columns=["name", "sequence"] ) NODE2SEQUENCE_COMPLEX_HARD = pd.DataFrame( - data=[[ - 'ENSDART00000161035.1:397-472', - 'AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAAT' - 'CAACA' - ], [ - 'ENSDART00000165342.1:125-304', - 'NNGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACA' - 'ATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCAC' - 'AGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA' - ], [ - 'ENSDART00000165342.1:317-460', - 'GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAG' - 'GTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTAC' - 'CAN' - ], [ - 'ENSDART00000165342.1:459-592', - 'NTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGG' - 'CTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAN' - ], [ - 'ENSDART00000165342.1:591-650', - 'NGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTNNNNN' - ], [ - 'ENSDART00000165342.1:645-746', - 'NNNNNTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAG' - 'AGTCTGCTGGAGGAATCAGTGTATCCACGCT' - ], [ - 'ENSDART00000165342.1:746-851', - 'GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGAT' - 'GGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA' - ], [ - 'ENSDART00000165342.1:854-886', - 'TGCAGCCAAACAATGCAACTGTGACAGCAGCA' - ], [ - 'ENSDART00000165342.1:899-953', - 'TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA' - ], [ - 'ENSDART00000165342.1:974-1097', - 'CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAA' - 'GTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA' - ], [ - 'ENSDART00000165342.1:1098-1175', - 'TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACAT' - 'ATCCTGA' - ]], + data=[ + [ + "ENSDART00000161035.1:0-326", + "TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTT" + "GATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAG" + "GGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCT" + "CTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCA" + "GCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA" + ], [ + "ENSDART00000161035.1:397-472", + "AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCC" + "AAATCAACA" + ], [ + "ENSDART00000161035.1:477-523", + "AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA" + ], [ + "ENSDART00000165342.1:5-127", + "TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAA" + "TCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGANN" + ], [ + "ENSDART00000165342.1:125-304", + "NNGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGA" + "CACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGC" + "TCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA" + ], [ + "ENSDART00000165342.1:317-460", + "GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCC" + "TGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAA" + "GAAGTTACCAN" + ], [ + "ENSDART00000165342.1:459-592", + "NTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTT" + "CTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTA" + "N" + ], [ + "ENSDART00000165342.1:591-650", + "NGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTNNNNN" + ], [ + "ENSDART00000165342.1:645-746", + "NNNNNTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTT" + "GGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT" + ], [ + "ENSDART00000165342.1:746-851", + "GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTG" + "GGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA" + ], [ + "ENSDART00000165342.1:854-886", + "TGCAGCCAAACAATGCAACTGTGACAGCAGCA", + ], [ + "ENSDART00000165342.1:899-953", + "TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA" + ], [ + "ENSDART00000165342.1:974-1097", + "CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAG" + "TAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA" + ], [ + "ENSDART00000165342.1:1098-1175", + "TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTC" + "ACATATCCTGA" + ], [ + "ENSDART00000165342.1:1176-1324", + "CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATT" + "GAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTG" + "GCAAAACACCACAGCG" + ] + ], columns=["name", "sequence"] - ) diff --git a/tests/test_io/test_masking.py b/tests/test_io/test_masking.py index 5dd7c30..935aa9f 100644 --- a/tests/test_io/test_masking.py +++ b/tests/test_io/test_masking.py @@ -21,18 +21,31 @@ class TestMask(TestCase): def test_no_mask(self): """exfi.io.masking._mask: no masking""" observed = mask(NODE2SEQUENCE_COMPLEX, EDGE2OVERLAP_COMPLEX, "none") + print( + "Observed:", observed, "", + "Expected:", NODE2SEQUENCE_COMPLEX, + sep="\n" + ) self.assertTrue(observed.equals(NODE2SEQUENCE_COMPLEX)) def test_soft_mask(self): """exfi.io.masking._mask: soft masking""" observed = mask(NODE2SEQUENCE_COMPLEX, EDGE2OVERLAP_COMPLEX, "soft") - print(observed.values.tolist()) + print( + "Observed:", observed, "", + "Expected:", NODE2SEQUENCE_COMPLEX_SOFT, + sep="\n" + ) self.assertTrue(observed.equals(NODE2SEQUENCE_COMPLEX_SOFT)) def test_hard_mask(self): """exfi.io.masking._mask: hard masking""" observed = mask(NODE2SEQUENCE_COMPLEX, EDGE2OVERLAP_COMPLEX, "hard") - print(observed.values.tolist()) + print( + "Observed:", observed, "", + "Expected:", NODE2SEQUENCE_COMPLEX_HARD, + sep="\n" + ) self.assertTrue(observed.equals(NODE2SEQUENCE_COMPLEX_HARD)) From d727e94055275ac3b5bc8d8f91c81bab35e3781c Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Thu, 24 Jan 2019 15:54:11 +0100 Subject: [PATCH 34/45] Masking in gfa1 to fasta --- exfi/io/gfa1_to_fasta.py | 103 ++++++++++++++++++++-------- tests/io/gapped_complex_hard.fa | 53 +------------- tests/test_io/test_gfa1_to_fasta.py | 60 ++++++++++++---- 3 files changed, 120 insertions(+), 96 deletions(-) diff --git a/exfi/io/gfa1_to_fasta.py b/exfi/io/gfa1_to_fasta.py index dccbee9..590c919 100644 --- a/exfi/io/gfa1_to_fasta.py +++ b/exfi/io/gfa1_to_fasta.py @@ -5,26 +5,52 @@ import pandas as pd -def gfa1_to_exons(fasta_out, gfa1_in): +from exfi.io.masking import mask, cigar_to_int + +def gfa1_to_exons(fasta_out, gfa1_in, masking='none'): """Extract the exons in Fasta format""" with open(gfa1_in, "r") as gfa, open(fasta_out, "w") as fasta: - segments = pd.DataFrame( - data=[ - x.strip().split("\t")[0:3] - for x in gfa.readlines() if x[0] == "S" - ], - columns=["RecordType", "Name", "Sequence"], - ) - if segments.shape[0] == 0: + data = [ + x.strip().split("\t") + for x in gfa.readlines() if x[0] in set(["S", "L"]) + ] + + if not data: return - segments["fasta"] = ">" + segments["Name"] + "\n" + segments["Sequence"] - segments.fasta.values.tofile(fasta, sep="\n", format="%s") + node2sequence = pd.DataFrame( + data=[x[0:3] for x in data if x[0] == "S"], + columns=["RecordType", "name", "sequence"] + ).drop(columns="RecordType") + + if node2sequence.shape[0] == 0: + return + + edge2overlap = pd.DataFrame( + data=[x[0:6] for x in data if x[0] == 'L'], + columns=["RecordType", "u", "FromOrient", "v", "ToOrient", + "OverlapCigar"] + ).drop(columns=["RecordType", "FromOrient", "ToOrient"]) + edge2overlap["overlap"] = edge2overlap.OverlapCigar.map(cigar_to_int) + + node2sequence = mask( + node2sequence=node2sequence, + edge2overlap=edge2overlap, + masking=masking + ) + + node2sequence["fasta"] = \ + ">" + node2sequence["name"] + "\n" + \ + node2sequence["sequence"] + + node2sequence.fasta.values.tofile(fasta, sep="\n", format="%s") fasta.write("\n") # Final end line -def gfa1_to_gapped_transcripts(fasta_out, gfa1_in, gap_size=100): + +def gfa1_to_gapped_transcripts( + fasta_out, gfa1_in, gap_size=100, masking='none'): """Convert a GFA1 file to a gapped transcript file""" with open(gfa1_in, "r") as gfa, open(fasta_out, "w") as fasta: @@ -34,43 +60,60 @@ def gfa1_to_gapped_transcripts(fasta_out, gfa1_in, gap_size=100): # Read only segments and paths data = [ x.strip().split("\t") - for x in gfa.readlines() if x[0] in set(["S", "P"]) + for x in gfa.readlines() if x[0] in set(["S", "P", "L"]) ] if not data: return - # Create {node_id: nucleotide} + # Segments -> node2sequence node2sequence = pd.DataFrame( data=[x[0:3] for x in data if x[0] == "S"], - columns=["RecordType", "Name", "Sequence"], - )\ - .drop(columns="RecordType")\ - .set_index("Name")\ - .to_dict()["Sequence"] - - # Get the path info - paths = pd.DataFrame( + columns=["RecordType", "name", "sequence"], + )\ + .drop(columns="RecordType") + + # Links -> edge2overlap + edge2overlap = pd.DataFrame( + data=[x[0:6] for x in data if x[0] == 'L'], + columns=["RecordType", "u", "FromOrient", "v", "ToOrient", + "OverlapCigar"] + ).drop(columns=["RecordType", "FromOrient", "ToOrient"]) + edge2overlap["overlap"] = edge2overlap.OverlapCigar.map(cigar_to_int) + + # Paths -> path2nodes + path2nodes = pd.DataFrame( data=[x[0:4] for x in data if x[0] == "P"], columns=["RecordType", "PathName", "SegmentNames", "Overlaps"] )\ .drop(columns=["RecordType", "Overlaps"]) + path2nodes["SegmentNames"] = path2nodes["SegmentNames"]\ + .str.replace("+", "") + del data - paths["SegmentNames"] = paths["SegmentNames"].str.replace("+", "") + # Mask the sequences + node2sequence = mask( + node2sequence=node2sequence, + edge2overlap=edge2overlap, + masking=masking + ) + + node2sequence_dict = node2sequence\ + .set_index('name')\ + .to_dict()['sequence'] # Compose the sequence - paths["gapped_sequence"] = paths\ + path2nodes["gapped_sequence"] = path2nodes\ .SegmentNames\ .str.split(',')\ - .map(lambda x: separator.join([node2sequence[y] for y in x])) - del node2sequence + .map(lambda x: separator.join([node2sequence_dict[y] for y in x])) # Create the fasta line - paths["fasta"] = \ - ">" + paths.PathName + " " + paths.SegmentNames + "\n" + \ - paths.gapped_sequence + path2nodes["fasta"] = \ + ">" + path2nodes.PathName + " " + path2nodes.SegmentNames + "\n" + \ + path2nodes.gapped_sequence # Dump everything - paths.fasta.values.tofile(fasta, sep="\n", format="%s") + path2nodes.fasta.values.tofile(fasta, sep="\n", format="%s") fasta.write("\n") # Final end line diff --git a/tests/io/gapped_complex_hard.fa b/tests/io/gapped_complex_hard.fa index 5dfe077..72c709c 100644 --- a/tests/io/gapped_complex_hard.fa +++ b/tests/io/gapped_complex_hard.fa @@ -1,53 +1,4 @@ >ENSDART00000161035.1 ENSDART00000161035.1:0-326,ENSDART00000161035.1:397-472,ENSDART00000161035.1:477-523 -TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTA -CATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACA -GCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCT -CTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCG -TCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGG -ATGTGGAAATTCCTCCGCCACGAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNAGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCT -GCCAGTCAGTCCAAATCAACANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NAGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA +TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA >ENSDART00000165342.1 ENSDART00000165342.1:5-127,ENSDART00000165342.1:125-304,ENSDART00000165342.1:317-460,ENSDART00000165342.1:459-592,ENSDART00000165342.1:591-650,ENSDART00000165342.1:645-746,ENSDART00000165342.1:746-851,ENSDART00000165342.1:854-886,ENSDART00000165342.1:899-953,ENSDART00000165342.1:974-1097,ENSDART00000165342.1:1098-1175,ENSDART00000165342.1:1176-1324 -TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAA -GGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGA -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGACCTGTAGTAGAAAC -AAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAA -TAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACC -ACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCANNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNGTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGA -GTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACAT -CTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCANNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTT -GGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTG -CTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTANNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGG -ATCAAGTAGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCT -TTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCTNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNGATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATT -GCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCC -CANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGCAGCCAAACAATGCAA -CTGTGACAGCAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTCGA -CTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAANNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNCGTTGCTGTAGATTCTTATTTCCTTCCCAAAC -CCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTA -ACAATGATGAGTTTGGCTTCTTGTTGGCTGANNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNTATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCG -CTGAAGCCTTGACGCTCACATATCCTGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -NNNNNNNNCCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACA -CACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTA -ACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG +TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGCAGCCAAACAATGCAACTGTGACAGCAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG diff --git a/tests/test_io/test_gfa1_to_fasta.py b/tests/test_io/test_gfa1_to_fasta.py index 6f06459..5ab77d8 100644 --- a/tests/test_io/test_gfa1_to_fasta.py +++ b/tests/test_io/test_gfa1_to_fasta.py @@ -17,7 +17,9 @@ from tests.io.fasta import \ EXONS_EMPTY_FN, EXONS_SIMPLE_FN, EXONS_COMPLEX_FN, \ - GAPPED_EMPTY_FN, GAPPED_SIMPLE_FN, GAPPED_COMPLEX_FN + EXONS_COMPLEX_SOFT_FN, EXONS_COMPLEX_HARD_FN, \ + GAPPED_EMPTY_FN, GAPPED_SIMPLE_FN, GAPPED_COMPLEX_FN, \ + GAPPED_COMPLEX_SOFT_FN, GAPPED_COMPLEX_HARD_FN class TestGFA1ToExons(TestCase): @@ -47,13 +49,29 @@ def test_complex(self): self.assertTrue(filecmp.cmp(tmp_file, EXONS_COMPLEX_FN)) os.remove(tmp_file) - # def test_soft_masking(self): - # """exfi.io.gfa1_to_fasta.gfa1_to_exons: complex and soft masking case""" - # pass - # - # def test_complex(self): - # """exfi.io.gfa1_to_fasta.gfa1_to_exons: complex and hard masking case""" - # pass + def test_soft_masking(self): + """exfi.io.gfa1_to_fasta.gfa1_to_exons: complex and soft masking case""" + tmp_file = mkstemp()[1] + print(tmp_file) + gfa1_to_exons( + fasta_out=tmp_file, + gfa1_in=GFA1_COMPLEX_FN, + masking='soft' + ) + self.assertTrue(filecmp.cmp(tmp_file, EXONS_COMPLEX_SOFT_FN)) + os.remove(tmp_file) + + def test_hard_masking(self): + """exfi.io.gfa1_to_fasta.gfa1_to_exons: complex and hard masking case""" + tmp_file = mkstemp()[1] + print(tmp_file) + gfa1_to_exons( + fasta_out=tmp_file, + gfa1_in=GFA1_COMPLEX_FN, + masking='hard' + ) + self.assertTrue(filecmp.cmp(tmp_file, EXONS_COMPLEX_HARD_FN)) + os.remove(tmp_file) @@ -84,13 +102,25 @@ def test_complex(self): self.assertTrue(filecmp.cmp(tmp_file, GAPPED_COMPLEX_FN)) os.remove(tmp_file) - # def test_soft_masking(self): - # """exfi.io.gfa1_to_fasta.gfa1_to_gapped_transcripts: complex and soft masking case""" - # pass - # - # def test_complex(self): - # """exfi.io.gfa1_to_fasta.gfa1_to_gapped_transcripts: complex and hard masking case""" - # pass + def test_soft_masking(self): + """exfi.io.gfa1_to_fasta.gfa1_to_gapped_transcripts: complex and soft masking case""" + tmp_file = mkstemp()[1] + print(tmp_file) + gfa1_to_gapped_transcripts( + fasta_out=tmp_file, gfa1_in=GFA1_COMPLEX_FN, masking='soft' + ) + self.assertTrue(filecmp.cmp(tmp_file, GAPPED_COMPLEX_SOFT_FN)) + os.remove(tmp_file) + + def test_hard_masking(self): + """exfi.io.gfa1_to_fasta.gfa1_to_gapped_transcripts: complex and hard masking case""" + tmp_file = mkstemp()[1] + print(tmp_file) + gfa1_to_gapped_transcripts( + fasta_out=tmp_file, gfa1_in=GFA1_COMPLEX_FN, masking='hard' + ) + self.assertTrue(filecmp.cmp(tmp_file, GAPPED_COMPLEX_HARD_FN)) + os.remove(tmp_file) From 7663bc3f22735c6af77f8e603c87622950500ea3 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Thu, 24 Jan 2019 16:36:31 +0100 Subject: [PATCH 35/45] Masking GFA1 files + tests --- exfi/io/bed4_to_gfa1.py | 34 ++++++++++++++++------ tests/io/complex_hard.gfa | 46 ++++++++++++++++++++++++++++++ tests/io/complex_soft.gfa | 46 ++++++++++++++++++++++++++++++ tests/io/gfa1.py | 2 ++ tests/test_io/test_bed4_to_gfa1.py | 30 ++++++++++++++++++- 5 files changed, 149 insertions(+), 9 deletions(-) create mode 100644 tests/io/complex_hard.gfa create mode 100644 tests/io/complex_soft.gfa diff --git a/exfi/io/bed4_to_gfa1.py b/exfi/io/bed4_to_gfa1.py index 4427ec0..2e09fa2 100644 --- a/exfi/io/bed4_to_gfa1.py +++ b/exfi/io/bed4_to_gfa1.py @@ -9,6 +9,9 @@ bed4_to_node2sequence, \ bed4_to_edge2overlap +from exfi.io.masking import \ + mask + def compute_header(): """Write GFA1 header""" header = pd.DataFrame( @@ -18,15 +21,28 @@ def compute_header(): return header -def compute_segments(bed4, transcriptome_dict): +def compute_segments(bed4, transcriptome_dict, masking='none'): """Create the Segments subdataframe for GFA1 file""" - segments = bed4_to_node2sequence(bed4=bed4, transcriptome_dict=transcriptome_dict) + segments = bed4_to_node2sequence( + bed4=bed4, transcriptome_dict=transcriptome_dict + ) + edge2overlap = bed4_to_edge2overlap(bed4) + segments = mask( + node2sequence=segments, edge2overlap=edge2overlap, masking=masking + ) + del edge2overlap + # Add the S and length columns segments["RecordType"] = "S" - segments["SegmentLength"] = segments.sequence.map(lambda x: "LN:i:" + str(len(x))) + + # Compute lengths + segments["SegmentLength"] = segments\ + .sequence.map(lambda x: "LN:i:" + str(len(x))) + # reorder segments = segments\ [["RecordType", "name", "sequence", "SegmentLength"]] + return segments @@ -74,17 +90,19 @@ def compute_paths(bed4): return paths -def bed4_to_gfa1(gfa1_fn, bed4, transcriptome_dict): +def bed4_to_gfa1(gfa1_fn, bed4, transcriptome_dict, masking='none'): """Convert the BED4 dataframe into a GFA1 file""" with open(gfa1_fn, "w", 1024**3) as gfa: compute_header()\ .to_csv(gfa, sep="\t", header=False, index=False) with open(gfa1_fn, "a", 1024**3) as gfa: - compute_segments(bed4, transcriptome_dict)\ + compute_segments( + bed4=bed4, transcriptome_dict=transcriptome_dict, masking=masking + )\ .to_csv(gfa, sep="\t", header=False, index=False) - compute_links(bed4)\ + compute_links(bed4=bed4)\ .to_csv(gfa, sep="\t", header=False, index=False) - compute_containments(bed4)\ + compute_containments(bed4=bed4)\ .to_csv(gfa, sep="\t", header=False, index=False) - compute_paths(bed4)\ + compute_paths(bed4=bed4)\ .to_csv(gfa, sep="\t", header=False, index=False) diff --git a/tests/io/complex_hard.gfa b/tests/io/complex_hard.gfa new file mode 100644 index 0000000..77cdf64 --- /dev/null +++ b/tests/io/complex_hard.gfa @@ -0,0 +1,46 @@ +H VN:Z:1.0 +S ENSDART00000161035.1:0-326 TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA LN:i:326 +S ENSDART00000161035.1:397-472 AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACA LN:i:75 +S ENSDART00000161035.1:477-523 AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA LN:i:46 +S ENSDART00000165342.1:5-127 TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGANN LN:i:122 +S ENSDART00000165342.1:125-304 NNGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA LN:i:179 +S ENSDART00000165342.1:317-460 GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAN LN:i:143 +S ENSDART00000165342.1:459-592 NTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAN LN:i:133 +S ENSDART00000165342.1:591-650 NGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTNNNNN LN:i:59 +S ENSDART00000165342.1:645-746 NNNNNTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT LN:i:101 +S ENSDART00000165342.1:746-851 GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA LN:i:105 +S ENSDART00000165342.1:854-886 TGCAGCCAAACAATGCAACTGTGACAGCAGCA LN:i:32 +S ENSDART00000165342.1:899-953 TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA LN:i:54 +S ENSDART00000165342.1:974-1097 CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA LN:i:123 +S ENSDART00000165342.1:1098-1175 TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGA LN:i:77 +S ENSDART00000165342.1:1176-1324 CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG LN:i:148 +L ENSDART00000161035.1:0-326 + ENSDART00000161035.1:397-472 + 71N +L ENSDART00000161035.1:397-472 + ENSDART00000161035.1:477-523 + 5N +L ENSDART00000165342.1:5-127 + ENSDART00000165342.1:125-304 + 2M +L ENSDART00000165342.1:125-304 + ENSDART00000165342.1:317-460 + 13N +L ENSDART00000165342.1:317-460 + ENSDART00000165342.1:459-592 + 1M +L ENSDART00000165342.1:459-592 + ENSDART00000165342.1:591-650 + 1M +L ENSDART00000165342.1:591-650 + ENSDART00000165342.1:645-746 + 5M +L ENSDART00000165342.1:645-746 + ENSDART00000165342.1:746-851 + 0M +L ENSDART00000165342.1:746-851 + ENSDART00000165342.1:854-886 + 3N +L ENSDART00000165342.1:854-886 + ENSDART00000165342.1:899-953 + 13N +L ENSDART00000165342.1:899-953 + ENSDART00000165342.1:974-1097 + 21N +L ENSDART00000165342.1:974-1097 + ENSDART00000165342.1:1098-1175 + 1N +L ENSDART00000165342.1:1098-1175 + ENSDART00000165342.1:1176-1324 + 1N +C ENSDART00000161035.1 + ENSDART00000161035.1:0-326 + 0 326M +C ENSDART00000161035.1 + ENSDART00000161035.1:397-472 + 397 75M +C ENSDART00000161035.1 + ENSDART00000161035.1:477-523 + 477 46M +C ENSDART00000165342.1 + ENSDART00000165342.1:5-127 + 5 122M +C ENSDART00000165342.1 + ENSDART00000165342.1:125-304 + 125 179M +C ENSDART00000165342.1 + ENSDART00000165342.1:317-460 + 317 143M +C ENSDART00000165342.1 + ENSDART00000165342.1:459-592 + 459 133M +C ENSDART00000165342.1 + ENSDART00000165342.1:591-650 + 591 59M +C ENSDART00000165342.1 + ENSDART00000165342.1:645-746 + 645 101M +C ENSDART00000165342.1 + ENSDART00000165342.1:746-851 + 746 105M +C ENSDART00000165342.1 + ENSDART00000165342.1:854-886 + 854 32M +C ENSDART00000165342.1 + ENSDART00000165342.1:899-953 + 899 54M +C ENSDART00000165342.1 + ENSDART00000165342.1:974-1097 + 974 123M +C ENSDART00000165342.1 + ENSDART00000165342.1:1098-1175 + 1098 77M +C ENSDART00000165342.1 + ENSDART00000165342.1:1176-1324 + 1176 148M +P ENSDART00000161035.1 ENSDART00000161035.1:0-326+,ENSDART00000161035.1:397-472+,ENSDART00000161035.1:477-523+ * +P ENSDART00000165342.1 ENSDART00000165342.1:5-127+,ENSDART00000165342.1:125-304+,ENSDART00000165342.1:317-460+,ENSDART00000165342.1:459-592+,ENSDART00000165342.1:591-650+,ENSDART00000165342.1:645-746+,ENSDART00000165342.1:746-851+,ENSDART00000165342.1:854-886+,ENSDART00000165342.1:899-953+,ENSDART00000165342.1:974-1097+,ENSDART00000165342.1:1098-1175+,ENSDART00000165342.1:1176-1324+ * diff --git a/tests/io/complex_soft.gfa b/tests/io/complex_soft.gfa new file mode 100644 index 0000000..a36b20d --- /dev/null +++ b/tests/io/complex_soft.gfa @@ -0,0 +1,46 @@ +H VN:Z:1.0 +S ENSDART00000161035.1:0-326 TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA LN:i:326 +S ENSDART00000161035.1:397-472 AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACA LN:i:75 +S ENSDART00000161035.1:477-523 AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA LN:i:46 +S ENSDART00000165342.1:5-127 TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAag LN:i:122 +S ENSDART00000165342.1:125-304 agGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA LN:i:179 +S ENSDART00000165342.1:317-460 GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAg LN:i:143 +S ENSDART00000165342.1:459-592 gTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAa LN:i:133 +S ENSDART00000165342.1:591-650 aGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTgccct LN:i:59 +S ENSDART00000165342.1:645-746 gccctTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT LN:i:101 +S ENSDART00000165342.1:746-851 GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA LN:i:105 +S ENSDART00000165342.1:854-886 TGCAGCCAAACAATGCAACTGTGACAGCAGCA LN:i:32 +S ENSDART00000165342.1:899-953 TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA LN:i:54 +S ENSDART00000165342.1:974-1097 CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA LN:i:123 +S ENSDART00000165342.1:1098-1175 TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGA LN:i:77 +S ENSDART00000165342.1:1176-1324 CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG LN:i:148 +L ENSDART00000161035.1:0-326 + ENSDART00000161035.1:397-472 + 71N +L ENSDART00000161035.1:397-472 + ENSDART00000161035.1:477-523 + 5N +L ENSDART00000165342.1:5-127 + ENSDART00000165342.1:125-304 + 2M +L ENSDART00000165342.1:125-304 + ENSDART00000165342.1:317-460 + 13N +L ENSDART00000165342.1:317-460 + ENSDART00000165342.1:459-592 + 1M +L ENSDART00000165342.1:459-592 + ENSDART00000165342.1:591-650 + 1M +L ENSDART00000165342.1:591-650 + ENSDART00000165342.1:645-746 + 5M +L ENSDART00000165342.1:645-746 + ENSDART00000165342.1:746-851 + 0M +L ENSDART00000165342.1:746-851 + ENSDART00000165342.1:854-886 + 3N +L ENSDART00000165342.1:854-886 + ENSDART00000165342.1:899-953 + 13N +L ENSDART00000165342.1:899-953 + ENSDART00000165342.1:974-1097 + 21N +L ENSDART00000165342.1:974-1097 + ENSDART00000165342.1:1098-1175 + 1N +L ENSDART00000165342.1:1098-1175 + ENSDART00000165342.1:1176-1324 + 1N +C ENSDART00000161035.1 + ENSDART00000161035.1:0-326 + 0 326M +C ENSDART00000161035.1 + ENSDART00000161035.1:397-472 + 397 75M +C ENSDART00000161035.1 + ENSDART00000161035.1:477-523 + 477 46M +C ENSDART00000165342.1 + ENSDART00000165342.1:5-127 + 5 122M +C ENSDART00000165342.1 + ENSDART00000165342.1:125-304 + 125 179M +C ENSDART00000165342.1 + ENSDART00000165342.1:317-460 + 317 143M +C ENSDART00000165342.1 + ENSDART00000165342.1:459-592 + 459 133M +C ENSDART00000165342.1 + ENSDART00000165342.1:591-650 + 591 59M +C ENSDART00000165342.1 + ENSDART00000165342.1:645-746 + 645 101M +C ENSDART00000165342.1 + ENSDART00000165342.1:746-851 + 746 105M +C ENSDART00000165342.1 + ENSDART00000165342.1:854-886 + 854 32M +C ENSDART00000165342.1 + ENSDART00000165342.1:899-953 + 899 54M +C ENSDART00000165342.1 + ENSDART00000165342.1:974-1097 + 974 123M +C ENSDART00000165342.1 + ENSDART00000165342.1:1098-1175 + 1098 77M +C ENSDART00000165342.1 + ENSDART00000165342.1:1176-1324 + 1176 148M +P ENSDART00000161035.1 ENSDART00000161035.1:0-326+,ENSDART00000161035.1:397-472+,ENSDART00000161035.1:477-523+ * +P ENSDART00000165342.1 ENSDART00000165342.1:5-127+,ENSDART00000165342.1:125-304+,ENSDART00000165342.1:317-460+,ENSDART00000165342.1:459-592+,ENSDART00000165342.1:591-650+,ENSDART00000165342.1:645-746+,ENSDART00000165342.1:746-851+,ENSDART00000165342.1:854-886+,ENSDART00000165342.1:899-953+,ENSDART00000165342.1:974-1097+,ENSDART00000165342.1:1098-1175+,ENSDART00000165342.1:1176-1324+ * diff --git a/tests/io/gfa1.py b/tests/io/gfa1.py index f8139c9..2b760dd 100644 --- a/tests/io/gfa1.py +++ b/tests/io/gfa1.py @@ -273,3 +273,5 @@ GFA1_EMPTY_FN = "tests/io/empty.gfa" GFA1_SIMPLE_FN = "tests/io/simple.gfa" GFA1_COMPLEX_FN = "tests/io/complex.gfa" +GFA1_COMPLEX_SOFT_FN = "tests/io/complex_soft.gfa" +GFA1_COMPLEX_HARD_FN = "tests/io/complex_hard.gfa" diff --git a/tests/test_io/test_bed4_to_gfa1.py b/tests/test_io/test_bed4_to_gfa1.py index 0e0359c..3393b5f 100644 --- a/tests/test_io/test_bed4_to_gfa1.py +++ b/tests/test_io/test_bed4_to_gfa1.py @@ -29,7 +29,8 @@ LINKS_EMPTY, LINKS_SIMPLE, LINKS_COMPLEX, \ CONTAINMENTS_EMPTY, CONTAINMENTS_SIMPLE, CONTAINMENTS_COMPLEX, \ PATHS_EMPTY, PATHS_SIMPLE, PATHS_COMPLEX, \ - GFA1_EMPTY_FN, GFA1_SIMPLE_FN, GFA1_COMPLEX_FN + GFA1_EMPTY_FN, GFA1_SIMPLE_FN, GFA1_COMPLEX_FN, \ + GFA1_COMPLEX_SOFT_FN, GFA1_COMPLEX_HARD_FN @@ -173,6 +174,33 @@ def test_complex(self): self.assertTrue(filecmp.cmp(tmp_file, GFA1_COMPLEX_FN)) os.remove(tmp_file) + def test_complex_soft(self): + """exfi.io.bed4_to_gfa1.bed4_to_gfa1: complex soft masked case""" + tmp_file = mkstemp()[1] + print(tmp_file) + bed4_to_gfa1( + gfa1_fn=tmp_file, + bed4=BED4_COMPLEX, + transcriptome_dict=TRANSCRIPTOME_COMPLEX_DICT, + masking='soft' + ) + self.assertTrue(filecmp.cmp(tmp_file, GFA1_COMPLEX_SOFT_FN)) + os.remove(tmp_file) + + def test_complex_hard(self): + """exfi.io.bed4_to_gfa1.bed4_to_gfa1: complex hard masked case""" + tmp_file = mkstemp()[1] + print(tmp_file) + bed4_to_gfa1( + gfa1_fn=tmp_file, + bed4=BED4_COMPLEX, + transcriptome_dict=TRANSCRIPTOME_COMPLEX_DICT, + masking='hard' + ) + self.assertTrue(filecmp.cmp(tmp_file, GFA1_COMPLEX_HARD_FN)) + os.remove(tmp_file) + + if __name__ == '__main__': From 352585462e4478d648da6441c3cf25821ced8c31 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Thu, 24 Jan 2019 16:46:28 +0100 Subject: [PATCH 36/45] More tests in bed4_to_gfa1 --- tests/io/gfa1.py | 183 +++++++++++++++++++++++++++++ tests/test_io/test_bed4_to_gfa1.py | 16 +++ 2 files changed, 199 insertions(+) diff --git a/tests/io/gfa1.py b/tests/io/gfa1.py index 2b760dd..c6fe705 100644 --- a/tests/io/gfa1.py +++ b/tests/io/gfa1.py @@ -121,6 +121,189 @@ columns=["RecordType", "name", "sequence", "SegmentLength"] ) +SEGMENTS_COMPLEX_SOFT = pd.DataFrame( + data=[[ + "S", + "ENSDART00000161035.1:0-326", + "TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATT" + "ATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGG" + "AAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGAT" + "AGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAAC" + "TGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA", + "LN:i:326", + ], [ + "S", + "ENSDART00000161035.1:397-472", + "AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAAT" + "CAACA", + "LN:i:75", + ], [ + "S", + "ENSDART00000161035.1:477-523", + "AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA", "LN:i:46" + ], [ + "S", + "ENSDART00000165342.1:5-127", + "TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCAC" + "TTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAag", "LN:i:122" + ], [ + "S", + "ENSDART00000165342.1:125-304", + "agGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACA" + "ATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCAC" + "AGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA", "LN:i:179" + ], [ + "S", + "ENSDART00000165342.1:317-460", + "GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAG" + "GTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTAC" + "CAg", "LN:i:143" + ], [ + "S", + "ENSDART00000165342.1:459-592", + "gTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGG" + "CTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAa", + "LN:i:133" + ], [ + "S", + "ENSDART00000165342.1:591-650", + "aGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTgccct", + "LN:i:59" + ], [ + "S", + "ENSDART00000165342.1:645-746", + "gccctTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAG" + "AGTCTGCTGGAGGAATCAGTGTATCCACGCT", + "LN:i:101", + ], [ + "S", + "ENSDART00000165342.1:746-851", + "GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGAT" + "GGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA", "LN:i:105", + ], [ + "S", + "ENSDART00000165342.1:854-886", + "TGCAGCCAAACAATGCAACTGTGACAGCAGCA", + "LN:i:32" + ], [ + "S", + "ENSDART00000165342.1:899-953", + "TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA", "LN:i:54" + ], [ + "S", + "ENSDART00000165342.1:974-1097", + "CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAA" + "GTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA", + "LN:i:123" + ], [ + "S", + "ENSDART00000165342.1:1098-1175", + "TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACAT" + "ATCCTGA", + "LN:i:77", + ], [ + "S", + "ENSDART00000165342.1:1176-1324", + "CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAA" + "TCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACA" + "CCACAGCG", + "LN:i:148" + ]], + columns=["RecordType", "name", "sequence", "SegmentLength"] +) + +SEGMENTS_COMPLEX_HARD = pd.DataFrame( + data=[[ + "S", + "ENSDART00000161035.1:0-326", + "TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATT" + "ATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGG" + "AAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGAT" + "AGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAAC" + "TGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA", + "LN:i:326", + ], [ + "S", + "ENSDART00000161035.1:397-472", + "AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAAT" + "CAACA", + "LN:i:75", + ], [ + "S", + "ENSDART00000161035.1:477-523", + "AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA", "LN:i:46" + ], [ + "S", + "ENSDART00000165342.1:5-127", + "TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCAC" + "TTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGANN", "LN:i:122" + ], [ + "S", + "ENSDART00000165342.1:125-304", + "NNGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACA" + "ATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCAC" + "AGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA", "LN:i:179" + ], [ + "S", + "ENSDART00000165342.1:317-460", + "GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAG" + "GTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTAC" + "CAN", "LN:i:143" + ], [ + "S", + "ENSDART00000165342.1:459-592", + "NTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGG" + "CTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAN", + "LN:i:133" + ], [ + "S", + "ENSDART00000165342.1:591-650", + "NGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTNNNNN", + "LN:i:59" + ], [ + "S", + "ENSDART00000165342.1:645-746", + "NNNNNTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAG" + "AGTCTGCTGGAGGAATCAGTGTATCCACGCT", + "LN:i:101", + ], [ + "S", + "ENSDART00000165342.1:746-851", + "GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGAT" + "GGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA", "LN:i:105", + ], [ + "S", + "ENSDART00000165342.1:854-886", + "TGCAGCCAAACAATGCAACTGTGACAGCAGCA", + "LN:i:32" + ], [ + "S", + "ENSDART00000165342.1:899-953", + "TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA", "LN:i:54" + ], [ + "S", + "ENSDART00000165342.1:974-1097", + "CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAA" + "GTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA", + "LN:i:123" + ], [ + "S", + "ENSDART00000165342.1:1098-1175", + "TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACAT" + "ATCCTGA", + "LN:i:77", + ], [ + "S", + "ENSDART00000165342.1:1176-1324", + "CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAA" + "TCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACA" + "CCACAGCG", + "LN:i:148" + ]], + columns=["RecordType", "name", "sequence", "SegmentLength"] +) + + LINKS_EMPTY = pd.DataFrame( diff --git a/tests/test_io/test_bed4_to_gfa1.py b/tests/test_io/test_bed4_to_gfa1.py index 3393b5f..82c2da1 100644 --- a/tests/test_io/test_bed4_to_gfa1.py +++ b/tests/test_io/test_bed4_to_gfa1.py @@ -26,6 +26,7 @@ from tests.io.gfa1 import \ HEADER, \ SEGMENTS_EMPTY, SEGMENTS_SIMPLE, SEGMENTS_COMPLEX, \ + SEGMENTS_COMPLEX_SOFT, SEGMENTS_COMPLEX_HARD, \ LINKS_EMPTY, LINKS_SIMPLE, LINKS_COMPLEX, \ CONTAINMENTS_EMPTY, CONTAINMENTS_SIMPLE, CONTAINMENTS_COMPLEX, \ PATHS_EMPTY, PATHS_SIMPLE, PATHS_COMPLEX, \ @@ -62,6 +63,21 @@ def test_complex(self): observed = compute_segments(BED4_COMPLEX, TRANSCRIPTOME_COMPLEX_DICT) self.assertTrue(observed.equals(SEGMENTS_COMPLEX)) + def test_complex_simple(self): + """exfi.io.bed4_to_gfa1.compute_segments: complex case""" + observed = compute_segments( + BED4_COMPLEX, TRANSCRIPTOME_COMPLEX_DICT, 'soft' + ) + self.assertTrue(observed.equals(SEGMENTS_COMPLEX_SOFT)) + + + def test_complex_hard(self): + """exfi.io.bed4_to_gfa1.compute_segments: complex case""" + observed = compute_segments( + BED4_COMPLEX, TRANSCRIPTOME_COMPLEX_DICT, 'hard' + ) + self.assertTrue(observed.equals(SEGMENTS_COMPLEX_HARD)) + class TestComputeLinks(TestCase): """Tests for exfi.io.bed4_to_gfa1.compute_links""" From 347054ff5e74daf6b78c45339f5ffe8fbe63dd7b Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Fri, 25 Jan 2019 11:46:03 +0100 Subject: [PATCH 37/45] Universal GFA1 reader + tests and gfa variables --- exfi/io/bed4_to_gfa1.py | 17 ++++--- exfi/io/gfa1.py | 12 +++++ exfi/io/read_gfa.py | 52 ++++++++++++++++++++++ tests/io/gfa1.py | 37 ++++++++-------- tests/test_io/test_bed4_to_gfa1.py | 7 +++ tests/test_io/test_read_gfa.py | 71 ++++++++++++++++++++++++++++++ 6 files changed, 172 insertions(+), 24 deletions(-) create mode 100644 exfi/io/gfa1.py create mode 100644 exfi/io/read_gfa.py create mode 100644 tests/test_io/test_read_gfa.py diff --git a/exfi/io/bed4_to_gfa1.py b/exfi/io/bed4_to_gfa1.py index 2e09fa2..5c24e88 100644 --- a/exfi/io/bed4_to_gfa1.py +++ b/exfi/io/bed4_to_gfa1.py @@ -9,6 +9,9 @@ bed4_to_node2sequence, \ bed4_to_edge2overlap +from exfi.io.gfa1 import \ + HEADER_COLS, SEGMENT_COLS, LINK_COLS, CONTAINMENT_COLS, PATH_COLS + from exfi.io.masking import \ mask @@ -16,7 +19,7 @@ def compute_header(): """Write GFA1 header""" header = pd.DataFrame( data=[["H", "VN:Z:1.0"]], - columns=["RecordType", "Version"] + columns=HEADER_COLS ) return header @@ -32,16 +35,18 @@ def compute_segments(bed4, transcriptome_dict, masking='none'): ) del edge2overlap + # Add the S and length columns segments["RecordType"] = "S" # Compute lengths - segments["SegmentLength"] = segments\ + segments["Length"] = segments\ .sequence.map(lambda x: "LN:i:" + str(len(x))) # reorder segments = segments\ - [["RecordType", "name", "sequence", "SegmentLength"]] + .rename(columns={'name': 'Name', 'sequence': 'Sequence'})\ + [SEGMENT_COLS] return segments @@ -54,7 +59,7 @@ def compute_links(bed4): links["FromOrient"] = "+" links["ToOrient"] = "+" links["Overlap"] = links.Overlap.map(lambda x: str(x) + "M" if x >= 0 else str(-x) + "N") - links = links[["RecordType", "From", "FromOrient", "To", "ToOrient", "Overlap"]] + links = links[LINK_COLS] return links @@ -70,7 +75,7 @@ def compute_containments(bed4): containments["Overlap"] = containments["chromEnd"] - containments["chromStart"] containments["Overlap"] = containments.Overlap.map(lambda x: str(x) + "M") containments = containments.drop(["chrom", "chromStart", "chromEnd", "name"], axis=1) - return containments + return containments[CONTAINMENT_COLS] def compute_paths(bed4): @@ -86,7 +91,7 @@ def compute_paths(bed4): paths["RecordType"] = "P" paths = paths.rename({"chrom": "PathName", "name": "SegmentNames"}, axis=1) paths["Overlaps"] = "*" - paths = paths[["RecordType", "PathName", "SegmentNames", "Overlaps"]] + paths = paths[PATH_COLS] return paths diff --git a/exfi/io/gfa1.py b/exfi/io/gfa1.py new file mode 100644 index 0000000..17f1121 --- /dev/null +++ b/exfi/io/gfa1.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 + +"""exfi.io.gfa1.py: submodule for auxliary variables for gfa""" + +HEADER_COLS = ['RecordType', 'VersionNumber'] +SEGMENT_COLS = ['RecordType', "Name", "Sequence", 'Length'] +LINK_COLS = ['RecordType', "From", "FromOrient", "To", "ToOrient", "Overlap"] +CONTAINMENT_COLS = [ + 'RecordType', 'Container', 'ContainerOrient', 'Contained', + 'ContainedOrient', 'Pos', 'Overlap' +] +PATH_COLS = ['RecordType', 'PathName', 'SegmentNames', 'Overlaps'] diff --git a/exfi/io/read_gfa.py b/exfi/io/read_gfa.py new file mode 100644 index 0000000..64a560f --- /dev/null +++ b/exfi/io/read_gfa.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 + +"""exfi.io.read_gfa.py: submodule to read GFA1 files""" + + +import pandas as pd +import numpy as np + +from exfi.io.gfa1 import \ + HEADER_COLS, SEGMENT_COLS, LINK_COLS, CONTAINMENT_COLS, PATH_COLS + + +def read_gfa1(gfa1_fn): + """Read the GFA1 file in gfa1_fn and return a dict of dataframes where the + keys are header, segments, links, containments, and paths. Values are + DataFrames, with the exception of the header""" + + with open(gfa1_fn, 'r') as gfa: + + gfa1 = {} + + data = [ + x.strip().split("\t") + for x in gfa.readlines() if x[0] in set(['H', 'S', 'L', 'C', 'P']) + ] + + gfa1['header'] = pd.DataFrame( + data=[x[0:2] for x in data if x[0] == 'H'], + columns=HEADER_COLS + ) + + gfa1['segments'] = pd.DataFrame( + data=[x[0:4] for x in data if x[0] == "S"], + columns=SEGMENT_COLS + ) + + gfa1['links'] = pd.DataFrame( + data=[x[0:6] for x in data if x[0] == 'L'], + columns=LINK_COLS + ) + + gfa1['containments'] = pd.DataFrame( + data=[x[0:7] for x in data if x[0] == 'C'], + columns=CONTAINMENT_COLS + ).astype({'Pos': np.int}) + + gfa1['paths'] = pd.DataFrame( + data=[x[0:4] for x in data if x[0] == 'P'], + columns=PATH_COLS + ) + + return gfa1 diff --git a/tests/io/gfa1.py b/tests/io/gfa1.py index c6fe705..b59d3ec 100644 --- a/tests/io/gfa1.py +++ b/tests/io/gfa1.py @@ -5,15 +5,19 @@ import pandas as pd import numpy as np + +from exfi.io.read_gfa import \ + HEADER_COLS, SEGMENT_COLS, LINK_COLS, CONTAINMENT_COLS, PATH_COLS + HEADER = pd.DataFrame( data=[["H", "VN:Z:1.0"]], - columns=["RecordType", "Version"] + columns=HEADER_COLS ) SEGMENTS_EMPTY = pd.DataFrame( - columns=["RecordType", "name", "sequence", "SegmentLength"] + columns=SEGMENT_COLS ) SEGMENTS_SIMPLE = pd.DataFrame( @@ -27,7 +31,7 @@ "TGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA", "LN:i:326" ]], - columns=["RecordType", "name", "sequence", "SegmentLength"] + columns=SEGMENT_COLS ) SEGMENTS_COMPLEX = pd.DataFrame( @@ -118,7 +122,7 @@ "CCACAGCG", "LN:i:148" ]], - columns=["RecordType", "name", "sequence", "SegmentLength"] + columns=SEGMENT_COLS ) SEGMENTS_COMPLEX_SOFT = pd.DataFrame( @@ -209,7 +213,7 @@ "CCACAGCG", "LN:i:148" ]], - columns=["RecordType", "name", "sequence", "SegmentLength"] + columns=SEGMENT_COLS ) SEGMENTS_COMPLEX_HARD = pd.DataFrame( @@ -300,18 +304,18 @@ "CCACAGCG", "LN:i:148" ]], - columns=["RecordType", "name", "sequence", "SegmentLength"] + columns=SEGMENT_COLS ) LINKS_EMPTY = pd.DataFrame( - columns=["RecordType", "From", "FromOrient", "To", "ToOrient", "Overlap"] + columns=LINK_COLS ) LINKS_SIMPLE = pd.DataFrame( - columns=["RecordType", "From", "FromOrient", "To", "ToOrient", "Overlap"] + columns=LINK_COLS ) LINKS_COMPLEX = pd.DataFrame( @@ -355,14 +359,13 @@ "L", "ENSDART00000165342.1:1098-1175", "+", "ENSDART00000165342.1:1176-1324", "+", "1N" ]], - columns=["RecordType", "From", "FromOrient", "To", "ToOrient", "Overlap"] + columns=LINK_COLS ) CONTAINMENTS_EMPTY = pd.DataFrame( - columns=["RecordType", "Container", "ContainerOrient", "Contained", - "ContainedOrient", "Pos", "Overlap"] + columns=CONTAINMENT_COLS ) CONTAINMENTS_EMPTY = CONTAINMENTS_EMPTY.astype({"Overlap": np.int64}) @@ -371,8 +374,7 @@ "C", "ENSDART00000161035.1", "+", "ENSDART00000161035.1:0-326", "+", 0, "326M" ]], - columns=["RecordType", "Container", "ContainerOrient", "Contained", - "ContainedOrient", "Pos", "Overlap"] + columns=CONTAINMENT_COLS ) CONTAINMENTS_COMPLEX = pd.DataFrame( data=[[ @@ -421,17 +423,16 @@ "C", "ENSDART00000165342.1", "+", "ENSDART00000165342.1:1176-1324", "+", 1176, "148M" ]], - columns=["RecordType", "Container", "ContainerOrient", "Contained", - "ContainedOrient", "Pos", "Overlap"] + columns=CONTAINMENT_COLS ) PATHS_EMPTY = pd.DataFrame( - columns=["RecordType", "PathName", "SegmentNames", "Overlaps"] + columns=PATH_COLS ) PATHS_SIMPLE = pd.DataFrame( data=[["P", "ENSDART00000161035.1", "ENSDART00000161035.1:0-326+", "*"]], - columns=["RecordType", "PathName", "SegmentNames", "Overlaps"] + columns=PATH_COLS ) PATHS_COMPLEX = pd.DataFrame( data=[[ @@ -450,7 +451,7 @@ "ENSDART00000165342.1:1098-1175+,ENSDART00000165342.1:1176-1324+", "*" ]], - columns=["RecordType", "PathName", "SegmentNames", "Overlaps"] + columns=PATH_COLS ) GFA1_EMPTY_FN = "tests/io/empty.gfa" diff --git a/tests/test_io/test_bed4_to_gfa1.py b/tests/test_io/test_bed4_to_gfa1.py index 82c2da1..bc4dc2e 100644 --- a/tests/test_io/test_bed4_to_gfa1.py +++ b/tests/test_io/test_bed4_to_gfa1.py @@ -51,16 +51,19 @@ class TestComputeSegments(TestCase): def test_empty(self): """exfi.io.bed4_to_gfa1.compute_segments: empty case""" observed = compute_segments(BED4_EMPTY, TRANSCRIPTOME_EMPTY_DICT) + print("Observed:", observed, "Expected:", SEGMENTS_EMPTY, sep="\n") self.assertTrue(observed.equals(SEGMENTS_EMPTY)) def test_simple(self): """exfi.io.bed4_to_gfa1.compute_segments: simple case""" observed = compute_segments(BED4_SIMPLE, TRANSCRIPTOME_SIMPLE_DICT) + print("Observed:", observed, "Expected:", SEGMENTS_SIMPLE, sep="\n") self.assertTrue(observed.equals(SEGMENTS_SIMPLE)) def test_complex(self): """exfi.io.bed4_to_gfa1.compute_segments: complex case""" observed = compute_segments(BED4_COMPLEX, TRANSCRIPTOME_COMPLEX_DICT) + print("Observed:", observed, "Expected:", SEGMENTS_COMPLEX, sep="\n") self.assertTrue(observed.equals(SEGMENTS_COMPLEX)) def test_complex_simple(self): @@ -68,6 +71,8 @@ def test_complex_simple(self): observed = compute_segments( BED4_COMPLEX, TRANSCRIPTOME_COMPLEX_DICT, 'soft' ) + print("Observed:", observed, "Expected:", SEGMENTS_COMPLEX_SOFT, + sep="\n") self.assertTrue(observed.equals(SEGMENTS_COMPLEX_SOFT)) @@ -76,6 +81,8 @@ def test_complex_hard(self): observed = compute_segments( BED4_COMPLEX, TRANSCRIPTOME_COMPLEX_DICT, 'hard' ) + print("Observed:", observed, "Expected:", SEGMENTS_COMPLEX_HARD, + sep="\n") self.assertTrue(observed.equals(SEGMENTS_COMPLEX_HARD)) diff --git a/tests/test_io/test_read_gfa.py b/tests/test_io/test_read_gfa.py new file mode 100644 index 0000000..6405d21 --- /dev/null +++ b/tests/test_io/test_read_gfa.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 + +"""tests.test_io.test_read_gfa.py: tests for exfi.io.read_gfa.py""" + + +from unittest import TestCase, main + +from exfi.io.read_gfa import read_gfa1 + +from tests.io.gfa1 import \ + HEADER, \ + SEGMENTS_EMPTY, SEGMENTS_SIMPLE, SEGMENTS_COMPLEX, \ + SEGMENTS_COMPLEX_SOFT, SEGMENTS_COMPLEX_HARD, \ + LINKS_EMPTY, LINKS_SIMPLE, LINKS_COMPLEX, \ + CONTAINMENTS_EMPTY, CONTAINMENTS_SIMPLE, CONTAINMENTS_COMPLEX, \ + PATHS_EMPTY, PATHS_SIMPLE, PATHS_COMPLEX, \ + GFA1_EMPTY_FN, GFA1_SIMPLE_FN, GFA1_COMPLEX_FN, \ + GFA1_COMPLEX_SOFT_FN, GFA1_COMPLEX_HARD_FN + +class TestReadGFA1(TestCase): + """Tests for exfi.io.read_gfa.read_gfa1""" + + def test_empty(self): + """exfi.io.read_gfa.read_gfa1: empty case""" + gfa1 = read_gfa1(GFA1_EMPTY_FN) + self.assertTrue(gfa1['header'].equals(HEADER)) + self.assertTrue(gfa1['segments'].equals(SEGMENTS_EMPTY)) + self.assertTrue(gfa1['links'].equals(LINKS_EMPTY)) + self.assertTrue(gfa1['containments'].equals(CONTAINMENTS_EMPTY)) + self.assertTrue(gfa1['paths'].equals(PATHS_EMPTY)) + + def test_simple(self): + """exfi.io.read_gfa.read_gfa1: simple case""" + gfa1 = read_gfa1(GFA1_SIMPLE_FN) + self.assertTrue(gfa1['header'].equals(HEADER)) + self.assertTrue(gfa1['segments'].equals(SEGMENTS_SIMPLE)) + self.assertTrue(gfa1['links'].equals(LINKS_SIMPLE)) + self.assertTrue(gfa1['containments'].equals(CONTAINMENTS_SIMPLE)) + self.assertTrue(gfa1['paths'].equals(PATHS_SIMPLE)) + + def test_complex(self): + """exfi.io.read_gfa.read_gfa1: complex case""" + gfa1 = read_gfa1(GFA1_COMPLEX_FN) + self.assertTrue(gfa1['header'].equals(HEADER)) + self.assertTrue(gfa1['segments'].equals(SEGMENTS_COMPLEX)) + self.assertTrue(gfa1['links'].equals(LINKS_COMPLEX)) + self.assertTrue(gfa1['containments'].equals(CONTAINMENTS_COMPLEX)) + self.assertTrue(gfa1['paths'].equals(PATHS_COMPLEX)) + + def test_complex_soft(self): + """exfi.io.read_gfa.read_gfa1: complex and soft masking case""" + gfa1 = read_gfa1(GFA1_COMPLEX_SOFT_FN) + self.assertTrue(gfa1['header'].equals(HEADER)) + self.assertTrue(gfa1['segments'].equals(SEGMENTS_COMPLEX_SOFT)) + self.assertTrue(gfa1['links'].equals(LINKS_COMPLEX)) + self.assertTrue(gfa1['containments'].equals(CONTAINMENTS_COMPLEX)) + self.assertTrue(gfa1['paths'].equals(PATHS_COMPLEX)) + + def test_complex_hard(self): + """exfi.io.read_gfa.read_gfa1: complex and hard masking case""" + gfa1 = read_gfa1(GFA1_COMPLEX_HARD_FN) + self.assertTrue(gfa1['header'].equals(HEADER)) + self.assertTrue(gfa1['segments'].equals(SEGMENTS_COMPLEX_HARD)) + self.assertTrue(gfa1['links'].equals(LINKS_COMPLEX)) + self.assertTrue(gfa1['containments'].equals(CONTAINMENTS_COMPLEX)) + self.assertTrue(gfa1['paths'].equals(PATHS_COMPLEX)) + + + +if __name__ == '__main__': + main() From a972e36612fc0447c9cd74d21e45457eaf381da6 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Fri, 25 Jan 2019 15:05:06 +0100 Subject: [PATCH 38/45] Using standardized names for bed and gfa dataframes --- exfi/correct.py | 5 ++-- exfi/find_exons.py | 10 +++----- exfi/io/bed.py | 45 ++++++++++++++++++++++------------ exfi/io/bed4_to_gfa1.py | 45 +++++++++++++++++----------------- exfi/io/gfa1.py | 32 ++++++++++++++++++------ exfi/io/gfa1_to_bed.py | 11 ++++----- exfi/io/gfa1_to_fasta.py | 47 ++++++++++++++++++++++------------- exfi/io/gff3_to_bed.py | 18 ++++++-------- exfi/io/read_bed.py | 8 +++--- exfi/io/read_gfa.py | 14 +++++------ exfi/polish.py | 53 ++++++++++++++++++++-------------------- tests/io/bed.py | 30 +++++++++-------------- tests/io/gfa1.py | 37 ++++++++++++++-------------- tests/test_correct.py | 5 ++-- 14 files changed, 199 insertions(+), 161 deletions(-) diff --git a/exfi/correct.py b/exfi/correct.py index 511ed0a..7ee22b2 100755 --- a/exfi/correct.py +++ b/exfi/correct.py @@ -12,6 +12,7 @@ import pandas as pd from exfi.io.bed import \ + BED3_COLS, \ bed3_to_bed4, \ bed4_to_node2sequence, \ bed4_to_edge2overlap @@ -150,9 +151,9 @@ def apply_correction_to_bed4(bed4, sealed_edges): return bed4 new_bed4 = bed4.copy().set_index("name") for row in sealed_edges.iloc[::-1].itertuples(): - new_bed4.loc[row.u, "chromEnd"] = new_bed4.loc[row.v, "chromEnd"] + new_bed4.loc[row.u, "chrom_end"] = new_bed4.loc[row.v, "chrom_end"] new_bed4 = new_bed4.drop(sealed_edges.v) - new_bed4 = bed3_to_bed4(new_bed4[["chrom", "chromStart", "chromEnd"]]) + new_bed4 = bed3_to_bed4(new_bed4[BED3_COLS]) return new_bed4.reset_index(drop=True) diff --git a/exfi/find_exons.py b/exfi/find_exons.py index f73ca3d..79fcbff 100644 --- a/exfi/find_exons.py +++ b/exfi/find_exons.py @@ -14,7 +14,8 @@ from subprocess import Popen, PIPE import pandas as pd -import numpy as np +from exfi.io.bed import BED3_COLS, BED3_DTYPES + def process_output(process): """Get lines in bed format from the output of a Popen. @@ -27,11 +28,8 @@ def process_output(process): stdout_line.decode().strip().split() for stdout_line in iter(process.stdout.readline, b'') ], - columns=["chrom", "chromStart", "chromEnd"] - ) - - bed3.chromStart = bed3.chromStart.astype(np.int64) - bed3.chromEnd = bed3.chromEnd.astype(np.int64) + columns=BED3_COLS + ).astype(BED3_DTYPES) process.stdout.close() process.wait() diff --git a/exfi/io/bed.py b/exfi/io/bed.py index c6a7ad5..ae970b4 100644 --- a/exfi/io/bed.py +++ b/exfi/io/bed.py @@ -2,6 +2,21 @@ """exfi.io.bed.py: submodule to wrangle BED dataframes""" +import numpy as np + +BED3_COLS = ['chrom', 'chrom_start', 'chrom_end'] +BED3_DTYPES = {'chrom': np.str, 'chrom_start': np.int64, 'chrom_end': np.int64} + + +BED4_COLS = BED3_COLS + ['name'] +BED4_DTYPES = { + 'chrom': np.str, + 'chrom_start': np.int64, + 'chrom_end': np.int64, + 'name': np.str +} + + def bed3_to_bed4(bed3): """Take a BED3 dataframe and add the name as: @@ -10,15 +25,15 @@ def bed3_to_bed4(bed3): bed4 = bed3.copy() bed4["name"] = \ bed4.chrom + ":" + \ - bed4.chromStart.map(str) + "-" + \ - bed4.chromEnd.map(str) + bed4.chrom_start.map(str) + "-" + \ + bed4.chrom_end.map(str) return bed4 def bed4_to_node2coordinates(bed4): """Compute the node2coordinates DataFrame: exon name, chrom, start, end""" node2coordinates = bed4\ - [["name", "chrom", "chromStart", "chromEnd"]]\ + [["name", "chrom", "chrom_start", "chrom_end"]]\ .set_index("name") return node2coordinates @@ -28,7 +43,7 @@ def bed4_to_path2nodes(bed4): {transcript_id : list of exons}. """ return bed4\ - .drop(columns=["chromStart", "chromEnd"])\ + .drop(columns=["chrom_start", "chrom_end"])\ .groupby("chrom")\ .agg(lambda x: x.tolist())\ .to_dict()["name"] @@ -42,8 +57,8 @@ def bed4_to_node2sequence(bed4, transcriptome_dict): node2sequence["sequence"] = node2sequence.chrom.map(transcriptome_dict) node2sequence["data_to_map"] = list(zip( node2sequence.sequence, - node2sequence.chromStart, - node2sequence.chromEnd + node2sequence.chrom_start, + node2sequence.chrom_end )) node2sequence.sequence = node2sequence.data_to_map.map(lambda x: x[0][x[1]:x[2]]) return node2sequence[["name", "sequence"]] @@ -55,24 +70,24 @@ def bed4_to_edge2overlap(bed4): """ overlaps = bed4.copy() # Get the transcript_id of the next exon - overlaps["chromNext"] = overlaps["chrom"].shift(-1) + overlaps["chrom_next"] = overlaps["chrom"].shift(-1) # Get the name of the next exon - overlaps["nameNext"] = overlaps["name"].shift(-1) + overlaps["name_next"] = overlaps["name"].shift(-1) # Get the start of the next exon - overlaps["chromStartNext"] = overlaps["chromStart"].shift(-1) + overlaps["chrom_start_next"] = overlaps["chrom_start"].shift(-1) # Get the end of the next exon - overlaps["chromEndNext"] = overlaps["chromEnd"].shift(-1) + overlaps["chrom_end_next"] = overlaps["chrom_end"].shift(-1) # Remove rows with different transcripts overlaps = overlaps\ - [overlaps["chrom"] == overlaps["chromNext"]] + [overlaps["chrom"] == overlaps["chrom_next"]] # Convert types - overlaps = overlaps.astype({"chromStartNext": int, "chromEndNext": int}) + overlaps = overlaps.astype({"chrom_start_next": int, "chrom_end_next": int}) # Compute the overlap - overlaps["overlap"] = overlaps["chromEnd"] - overlaps["chromStartNext"] + overlaps["overlap"] = overlaps["chrom_end"] - overlaps["chrom_start_next"] # Convert again just in case overlaps.astype({"overlap": int}) # Select and rename overlaps = overlaps\ - [["name", "nameNext", "overlap"]]\ - .rename({"name": "u", "nameNext": "v"}, axis=1) + [["name", "name_next", "overlap"]]\ + .rename({"name": "u", "name_next": "v"}, axis=1) return overlaps.reset_index(drop=True) diff --git a/exfi/io/bed4_to_gfa1.py b/exfi/io/bed4_to_gfa1.py index 5c24e88..3fb2650 100644 --- a/exfi/io/bed4_to_gfa1.py +++ b/exfi/io/bed4_to_gfa1.py @@ -37,15 +37,14 @@ def compute_segments(bed4, transcriptome_dict, masking='none'): # Add the S and length columns - segments["RecordType"] = "S" + segments["record_type"] = "S" # Compute lengths - segments["Length"] = segments\ + segments["length"] = segments\ .sequence.map(lambda x: "LN:i:" + str(len(x))) # reorder segments = segments\ - .rename(columns={'name': 'Name', 'sequence': 'Sequence'})\ [SEGMENT_COLS] return segments @@ -53,12 +52,12 @@ def compute_segments(bed4, transcriptome_dict, masking='none'): def compute_links(bed4): """Compute the Links subdataframe of a GFA1 file.""" - links = bed4_to_edge2overlap(bed4=bed4) - links.columns = ["From", "To", "Overlap"] - links["RecordType"] = "L" - links["FromOrient"] = "+" - links["ToOrient"] = "+" - links["Overlap"] = links.Overlap.map(lambda x: str(x) + "M" if x >= 0 else str(-x) + "N") + links = bed4_to_edge2overlap(bed4=bed4)\ + .rename(columns={'u': 'from', 'v': 'to'}) + links["record_type"] = "L" + links["from_orient"] = "+" + links["to_orient"] = "+" + links["overlap"] = links.overlap.map(lambda x: str(x) + "M" if x >= 0 else str(-x) + "N") links = links[LINK_COLS] return links @@ -66,15 +65,17 @@ def compute_links(bed4): def compute_containments(bed4): """Create the minimal containments subdataframe""" containments = bed4.copy() - containments["RecordType"] = "C" - containments["Container"] = containments["chrom"] - containments["ContainerOrient"] = "+" - containments["Contained"] = containments["name"] - containments["ContainedOrient"] = "+" - containments["Pos"] = containments["chromStart"] - containments["Overlap"] = containments["chromEnd"] - containments["chromStart"] - containments["Overlap"] = containments.Overlap.map(lambda x: str(x) + "M") - containments = containments.drop(["chrom", "chromStart", "chromEnd", "name"], axis=1) + containments["record_type"] = "C" + containments["container"] = containments["chrom"] + containments["container_orient"] = "+" + containments["contained"] = containments["name"] + containments["contained_orient"] = "+" + containments["pos"] = containments["chrom_start"] + containments["overlap"] = containments["chrom_end"] - containments["chrom_start"] + containments["overlap"] = containments.overlap.map(lambda x: str(x) + "M") + containments = containments.drop( + ["chrom", "chrom_start", "chrom_end", "name"], axis=1 + ) return containments[CONTAINMENT_COLS] @@ -83,14 +84,14 @@ def compute_paths(bed4): paths = bed4.copy() paths["name"] = paths["name"].map(lambda x: x + "+") paths = paths\ - .drop(columns=["chromStart", "chromEnd"])\ + .drop(columns=["chrom_start", "chrom_end"])\ .groupby("chrom", axis=0)\ .aggregate(lambda x: ",".join(x.tolist())) paths = paths.astype({"name": str}) # It may end up as float paths = paths.reset_index(drop=False) - paths["RecordType"] = "P" - paths = paths.rename({"chrom": "PathName", "name": "SegmentNames"}, axis=1) - paths["Overlaps"] = "*" + paths["record_type"] = "P" + paths = paths.rename({"chrom": "path_name", "name": "segment_names"}, axis=1) + paths["overlaps"] = "*" paths = paths[PATH_COLS] return paths diff --git a/exfi/io/gfa1.py b/exfi/io/gfa1.py index 17f1121..d5c9c23 100644 --- a/exfi/io/gfa1.py +++ b/exfi/io/gfa1.py @@ -1,12 +1,30 @@ #!/usr/bin/env python3 -"""exfi.io.gfa1.py: submodule for auxliary variables for gfa""" +'''exfi.io.gfa1.py: submodule for auxliary variables for gfa''' -HEADER_COLS = ['RecordType', 'VersionNumber'] -SEGMENT_COLS = ['RecordType', "Name", "Sequence", 'Length'] -LINK_COLS = ['RecordType', "From", "FromOrient", "To", "ToOrient", "Overlap"] +HEADER_COLS = ['record_type', 'version_number'] +SEGMENT_COLS = ['record_type', 'name', 'sequence', 'length'] +LINK_COLS = ['record_type', 'from', 'from_orient', 'to', 'to_orient', 'overlap'] CONTAINMENT_COLS = [ - 'RecordType', 'Container', 'ContainerOrient', 'Contained', - 'ContainedOrient', 'Pos', 'Overlap' + 'record_type', 'container', 'container_orient', 'contained', + 'contained_orient', 'pos', 'overlap' ] -PATH_COLS = ['RecordType', 'PathName', 'SegmentNames', 'Overlaps'] +PATH_COLS = ['record_type', 'path_name', 'segment_names', 'overlaps'] + + +HEADER_DTYPES = {'record_type': object, 'version_number': object} +SEGMENT_DTYPES = { + 'record_type': object, 'name': object, 'sequence': object, 'length': object +} +LINK_DTYPES = { + 'record_type': object, 'from': object, 'from_orient': object, 'to': object, + 'to_orient': object, 'overlap': object} +CONTAINMENT_DTYPES = { + 'record_type': object, 'container': object, 'container_orient': object, + 'contained': object, 'contained_orient': object, 'pos': int, + 'overlap': object +} +PATH_DTYPES = { + 'record_type': object, 'path_name': object, 'segment_names': object, + 'overlaps': object +} diff --git a/exfi/io/gfa1_to_bed.py b/exfi/io/gfa1_to_bed.py index 658c539..0663b12 100644 --- a/exfi/io/gfa1_to_bed.py +++ b/exfi/io/gfa1_to_bed.py @@ -6,6 +6,7 @@ import pandas as pd import numpy as np +from exfi.io.bed import BED4_COLS, BED4_DTYPES def gfa1_to_bed4(filename): """Read a GFA1 file and convert it to BED4""" @@ -27,10 +28,8 @@ def gfa1_to_bed4(filename): }, axis=1) containments["Overlap"] = containments["Overlap"]\ .map(lambda x: np.int(x[:-1])) - containments["chromStart"] = containments["Pos"] - containments["chromEnd"] = containments["Pos"] + containments["Overlap"] - containments = containments[["chrom", "chromStart", "chromEnd", "name"]] - containments = containments.astype( - {"chromStart": np.int64, "chromEnd": np.int64} - ) + containments["chrom_start"] = containments["Pos"] + containments["chrom_end"] = containments["Pos"] + containments["Overlap"] + containments = containments[BED4_COLS] + containments = containments.astype(BED4_DTYPES) return containments diff --git a/exfi/io/gfa1_to_fasta.py b/exfi/io/gfa1_to_fasta.py index 590c919..3d8f316 100644 --- a/exfi/io/gfa1_to_fasta.py +++ b/exfi/io/gfa1_to_fasta.py @@ -5,34 +5,47 @@ import pandas as pd +from exfi.io.read_gfa import read_gfa1 from exfi.io.masking import mask, cigar_to_int def gfa1_to_exons(fasta_out, gfa1_in, masking='none'): """Extract the exons in Fasta format""" - with open(gfa1_in, "r") as gfa, open(fasta_out, "w") as fasta: + with open(fasta_out, "w") as fasta: - data = [ - x.strip().split("\t") - for x in gfa.readlines() if x[0] in set(["S", "L"]) - ] + gfa1 = read_gfa1(gfa1_in) - if not data: - return + # data = [ + # x.strip().split("\t") + # for x in gfa.readlines() if x[0] in set(["S", "L"]) + # ] - node2sequence = pd.DataFrame( - data=[x[0:3] for x in data if x[0] == "S"], - columns=["RecordType", "name", "sequence"] - ).drop(columns="RecordType") + # if not data: + # return + + # node2sequence = pd.DataFrame( + # data=[x[0:3] for x in data if x[0] == "S"], + # columns=["RecordType", "name", "sequence"] + # ).drop(columns="RecordType") + + node2sequence = gfa1['segments']\ + .drop(columns='record_type') if node2sequence.shape[0] == 0: return - edge2overlap = pd.DataFrame( - data=[x[0:6] for x in data if x[0] == 'L'], - columns=["RecordType", "u", "FromOrient", "v", "ToOrient", - "OverlapCigar"] - ).drop(columns=["RecordType", "FromOrient", "ToOrient"]) - edge2overlap["overlap"] = edge2overlap.OverlapCigar.map(cigar_to_int) + # edge2overlap = pd.DataFrame( + # data=[x[0:6] for x in data if x[0] == 'L'], + # columns=["RecordType", "u", "FromOrient", "v", "ToOrient", + # "OverlapCigar"] + # ).drop(columns=["RecordType", "FromOrient", "ToOrient"]) + # edge2overlap["overlap"] = edge2overlap.OverlapCigar.map(cigar_to_int) + + edge2overlap = gfa1['links']\ + .drop(columns=['record_type', 'from_orient', 'to_orient'])\ + .rename(columns={ + 'from': 'u', 'to': 'v', 'overlap': 'overlap_cigar' + }) + edge2overlap['overlap'] = edge2overlap.overlap_cigar.map(cigar_to_int) node2sequence = mask( node2sequence=node2sequence, diff --git a/exfi/io/gff3_to_bed.py b/exfi/io/gff3_to_bed.py index 48faaa3..0db9019 100644 --- a/exfi/io/gff3_to_bed.py +++ b/exfi/io/gff3_to_bed.py @@ -6,7 +6,8 @@ import sys import pandas as pd -import numpy as np + +from exfi.io.bed import BED3_COLS, BED3_DTYPES def gff3_to_bed3(gff3_in, mode="ensembl"): """Read a GFF3 file and convert it to BED3, where coordinates are with @@ -23,11 +24,6 @@ def gff3_to_bed3(gff3_in, mode="ensembl"): "attributes" ] - bed3_columns = ["chrom", "chromStart", "chromEnd"] - bed3_dtypes = { - "chrom": np.str, "chromStart": np.int64, "chromEnd": np.int64 - } - raw = pd.read_csv( sep='\t', na_values=".", @@ -40,8 +36,8 @@ def gff3_to_bed3(gff3_in, mode="ensembl"): ) if raw.shape[0] == 0: - exons = pd.DataFrame(columns=bed3_columns) - exons = exons.astype(bed3_dtypes) + exons = pd.DataFrame(columns=BED3_COLS) + exons = exons.astype(BED3_DTYPES) return exons if mode == "gmap": @@ -96,11 +92,11 @@ def gff3_to_bed3(gff3_in, mode="ensembl"): merged = merged.rename(columns={ 'transcript_id': 'chrom', - 'transcript_start': 'chromStart', - 'transcript_end': 'chromEnd' + 'transcript_start': 'chrom_start', + 'transcript_end': 'chrom_end' }) - merged = merged.astype(bed3_dtypes) + merged = merged.astype(BED3_DTYPES) merged = merged.reset_index(drop=True) diff --git a/exfi/io/read_bed.py b/exfi/io/read_bed.py index 9c7d097..d7e306e 100644 --- a/exfi/io/read_bed.py +++ b/exfi/io/read_bed.py @@ -3,7 +3,8 @@ """exfi.io.read_bed.py: BED importer""" import pandas as pd -import numpy as np + +from exfi.io.bed import BED3_COLS, BED3_DTYPES def read_bed3(filename): @@ -13,8 +14,7 @@ def read_bed3(filename): header=None, sep='\t', usecols=[0, 1, 2], - names=["chrom", "chromStart", "chromEnd"], - dtype={"chrom": np.str, "chromStart": np.int64, "chromEnd": np.int64}, + names=BED3_COLS, engine='c' - ) + ).astype(BED3_DTYPES) return bed3 diff --git a/exfi/io/read_gfa.py b/exfi/io/read_gfa.py index 64a560f..b3d79af 100644 --- a/exfi/io/read_gfa.py +++ b/exfi/io/read_gfa.py @@ -4,10 +4,10 @@ import pandas as pd -import numpy as np from exfi.io.gfa1 import \ - HEADER_COLS, SEGMENT_COLS, LINK_COLS, CONTAINMENT_COLS, PATH_COLS + HEADER_COLS, SEGMENT_COLS, LINK_COLS, CONTAINMENT_COLS, PATH_COLS, \ + HEADER_DTYPES, SEGMENT_DTYPES, LINK_DTYPES, CONTAINMENT_DTYPES, PATH_DTYPES def read_gfa1(gfa1_fn): @@ -27,26 +27,26 @@ def read_gfa1(gfa1_fn): gfa1['header'] = pd.DataFrame( data=[x[0:2] for x in data if x[0] == 'H'], columns=HEADER_COLS - ) + ).astype(HEADER_DTYPES) gfa1['segments'] = pd.DataFrame( data=[x[0:4] for x in data if x[0] == "S"], columns=SEGMENT_COLS - ) + ).astype(SEGMENT_DTYPES) gfa1['links'] = pd.DataFrame( data=[x[0:6] for x in data if x[0] == 'L'], columns=LINK_COLS - ) + ).astype(LINK_DTYPES) gfa1['containments'] = pd.DataFrame( data=[x[0:7] for x in data if x[0] == 'C'], columns=CONTAINMENT_COLS - ).astype({'Pos': np.int}) + ).astype(CONTAINMENT_DTYPES) gfa1['paths'] = pd.DataFrame( data=[x[0:4] for x in data if x[0] == 'P'], columns=PATH_COLS - ) + ).astype(PATH_DTYPES) return gfa1 diff --git a/exfi/polish.py b/exfi/polish.py index cb8b317..77a4ba2 100755 --- a/exfi/polish.py +++ b/exfi/polish.py @@ -11,26 +11,26 @@ def polish_bed4(bed4, transcriptome_dict): polished = bed4.copy() # Get the transcript_id of the next exon - polished["chromNext"] = polished["chrom"].shift(-1) + polished["chrom_next"] = polished["chrom"].shift(-1) # Get the name of the next exon - polished["nameNext"] = polished["name"].shift(-1) + polished["name_next"] = polished["name"].shift(-1) # Get the start of the next exon - polished["chromStartNext"] = polished["chromStart"].shift(-1) + polished["chrom_start_next"] = polished["chrom_start"].shift(-1) # Get the end of the next exon - polished["chromEndNext"] = polished["chromEnd"].shift(-1) + polished["chrom_end_next"] = polished["chrom_end"].shift(-1) # Remove rows with different transcripts polished = polished\ - [polished["chrom"] == polished["chromNext"]] + [polished["chrom"] == polished["chrom_next"]] # cast from float to int - polished = polished.astype({"chromStartNext": int, "chromEndNext": int}) + polished = polished.astype({"chrom_start_next": int, "chrom_end_next": int}) # compute the overlap - polished["overlap"] = polished["chromEnd"] - polished["chromStartNext"] + polished["overlap"] = polished["chrom_end"] - polished["chrom_start_next"] # Throw away lines that cannot be polished polished = polished[polished.overlap >= 4] @@ -41,8 +41,8 @@ def polish_bed4(bed4, transcriptome_dict): # Prepare a column with the data required to extract the overlapping seq polished["data_to_map"] = list(zip( polished.sequence, - polished.chromStartNext + 1, - polished.chromEnd + 1 + polished.chrom_start_next + 1, + polished.chrom_end + 1 )) # Get the overlapping sequence @@ -57,37 +57,38 @@ def polish_bed4(bed4, transcriptome_dict): polished = polished[polished.overlap_index >= 0] # Correct positions - polished["chromEndCorrected"] = polished["chromEnd"] - 2 - polished["chromStartNextCorrected"] = \ - polished["chromStartNext"] + polished["overlap_index"] + 2 + polished["chrom_end_corrected"] = polished["chrom_end"] - 2 + polished["chrom_start_next_corrected"] = \ + polished["chrom_start_next"] + polished["overlap_index"] + 2 # Organize the elements to correct ends_to_change = polished\ - [["name", "chromEndCorrected"]]\ - .rename({"chromEndCorrected": "chromEnd"}, axis=1)\ + [["name", "chrom_end_corrected"]]\ + .rename({"chrom_end_corrected": "chrom_end"}, axis=1)\ .set_index("name") starts_to_change = polished\ - [["nameNext", "chromStartNextCorrected"]]\ - .rename( - {"nameNext": "name", "chromStartNextCorrected": "chromStart"}, - axis=1 - )\ + [["name_next", "chrom_start_next_corrected"]]\ + .rename(columns={ + "name_next": "name", + "chrom_start_next_corrected": + "chrom_start" + })\ .set_index("name") bed4_new = bed4.set_index("name") # Correct the starts - bed4_new.loc[starts_to_change.index.tolist()].chromStart = \ - starts_to_change.chromStart + bed4_new.loc[starts_to_change.index.tolist()].chrom_start = \ + starts_to_change.chrom_start # Correct the ends - bed4_new.loc[ends_to_change.index.tolist()].chromEnd = \ - ends_to_change.chromEnd + bed4_new.loc[ends_to_change.index.tolist()].chrom_end = \ + ends_to_change.chrom_end bed4_new = bed4_new.reset_index(drop=False) bed4_new["name"] = \ bed4_new.chrom + ":" + \ - bed4_new.chromStart.map(str) + "-" + \ - bed4_new.chromEnd.map(str) + bed4_new.chrom_start.map(str) + "-" + \ + bed4_new.chrom_end.map(str) - return bed4_new[["chrom", "chromStart", "chromEnd", "name"]] + return bed4_new[["chrom", "chrom_start", "chrom_end", "name"]] diff --git a/tests/io/bed.py b/tests/io/bed.py index f1538d0..c0b0510 100644 --- a/tests/io/bed.py +++ b/tests/io/bed.py @@ -5,26 +5,21 @@ import pandas as pd import numpy as np -BED3_COLUMNS = ['chrom', 'chromStart', 'chromEnd'] -BED3_DTYPES = {'chrom': np.str, 'chromStart': np.int64, 'chromEnd': np.int64} - -BED4_COLUMNS = ['chrom', 'chromStart', 'chromEnd', 'name'] -BED4_DTYPES = { - 'chrom': np.str, 'chromStart': np.int64, 'chromEnd': np.int64, 'name':np.str -} - +from exfi.io.bed import \ + BED3_COLS, BED3_DTYPES, \ + BED4_COLS, BED4_DTYPES BED3_EMPTY_FN = "tests/io/empty.bed" BED3_SIMPLE_FN = "tests/io/simple.bed" BED3_COMPLEX_FN = "tests/io/complex.bed" -BED3_EMPTY = pd.DataFrame(columns=BED3_COLUMNS) +BED3_EMPTY = pd.DataFrame(columns=BED3_COLS) BED3_EMPTY = BED3_EMPTY.astype(BED3_DTYPES) BED3_SIMPLE = pd.DataFrame( data=[("ENSDART00000161035.1", 0, 326)], - columns=BED3_COLUMNS + columns=BED3_COLS ) BED3_COMPLEX = pd.DataFrame( @@ -45,18 +40,18 @@ ["ENSDART00000165342.1", 1098, 1175], ["ENSDART00000165342.1", 1176, 1324] ], - columns=BED3_COLUMNS + columns=BED3_COLS ) -BED4_EMPTY = pd.DataFrame(columns=BED4_COLUMNS) +BED4_EMPTY = pd.DataFrame(columns=BED4_COLS) BED4_EMPTY = BED4_EMPTY.astype(BED4_DTYPES) BED4_SIMPLE = pd.DataFrame( data=[("ENSDART00000161035.1", 0, 326, "ENSDART00000161035.1:0-326")], - columns=BED4_COLUMNS + columns=BED4_COLS ) BED4_COMPLEX = pd.DataFrame( @@ -77,7 +72,7 @@ ["ENSDART00000165342.1", 1098, 1175, "ENSDART00000165342.1:1098-1175"], ["ENSDART00000165342.1", 1176, 1324, "ENSDART00000165342.1:1176-1324"] ], - columns=BED4_COLUMNS + columns=BED4_COLS ) @@ -279,9 +274,8 @@ ['ENSDART00000167898', 424, 488], ['ENSDART00000167898', 488, 605] ], - columns=BED3_COLUMNS + columns=BED3_COLS ) -# BED3_NCBI = pd.DataFrame(columns=["chrom", "chromStart", "chromEnd"]) BED3_GMAP = pd.DataFrame( data=[ ['ENSDART00000171570', 0, 61], @@ -361,7 +355,7 @@ ['ENSDART00000172182', 61, 413], ['ENSDART00000172374', 0, 355] ], - columns=BED3_COLUMNS + columns=BED3_COLS ) @@ -386,7 +380,7 @@ ["ENSDART00000165342.1", 1098, 1175, "ENSDART00000165342.1:1098-1175"], ["ENSDART00000165342.1", 1176, 1324, "ENSDART00000165342.1:1176-1324"] ], - columns=["chrom", "chromStart", "chromEnd", "name"] + columns=BED4_COLS ) diff --git a/tests/io/gfa1.py b/tests/io/gfa1.py index b59d3ec..6c7e3f4 100644 --- a/tests/io/gfa1.py +++ b/tests/io/gfa1.py @@ -6,19 +6,20 @@ import numpy as np -from exfi.io.read_gfa import \ - HEADER_COLS, SEGMENT_COLS, LINK_COLS, CONTAINMENT_COLS, PATH_COLS +from exfi.io.gfa1 import \ + HEADER_COLS, SEGMENT_COLS, LINK_COLS, CONTAINMENT_COLS, PATH_COLS, \ + HEADER_DTYPES, SEGMENT_DTYPES, LINK_DTYPES, CONTAINMENT_DTYPES, PATH_DTYPES HEADER = pd.DataFrame( data=[["H", "VN:Z:1.0"]], columns=HEADER_COLS -) +).astype(HEADER_DTYPES) SEGMENTS_EMPTY = pd.DataFrame( columns=SEGMENT_COLS -) +).astype(SEGMENT_DTYPES) SEGMENTS_SIMPLE = pd.DataFrame( data=[[ @@ -32,7 +33,7 @@ "LN:i:326" ]], columns=SEGMENT_COLS -) +).astype(SEGMENT_DTYPES) SEGMENTS_COMPLEX = pd.DataFrame( data=[[ @@ -123,7 +124,7 @@ "LN:i:148" ]], columns=SEGMENT_COLS -) +).astype(SEGMENT_DTYPES) SEGMENTS_COMPLEX_SOFT = pd.DataFrame( data=[[ @@ -214,7 +215,7 @@ "LN:i:148" ]], columns=SEGMENT_COLS -) +).astype(SEGMENT_DTYPES) SEGMENTS_COMPLEX_HARD = pd.DataFrame( data=[[ @@ -305,18 +306,18 @@ "LN:i:148" ]], columns=SEGMENT_COLS -) +).astype(SEGMENT_DTYPES) LINKS_EMPTY = pd.DataFrame( columns=LINK_COLS -) +).astype(LINK_DTYPES) LINKS_SIMPLE = pd.DataFrame( columns=LINK_COLS -) +).astype(LINK_DTYPES) LINKS_COMPLEX = pd.DataFrame( data=[[ @@ -360,14 +361,13 @@ "ENSDART00000165342.1:1176-1324", "+", "1N" ]], columns=LINK_COLS -) +).astype(LINK_DTYPES) CONTAINMENTS_EMPTY = pd.DataFrame( columns=CONTAINMENT_COLS -) -CONTAINMENTS_EMPTY = CONTAINMENTS_EMPTY.astype({"Overlap": np.int64}) +).astype(CONTAINMENT_DTYPES) CONTAINMENTS_SIMPLE = pd.DataFrame( data=[[ @@ -375,7 +375,8 @@ 0, "326M" ]], columns=CONTAINMENT_COLS -) +).astype(CONTAINMENT_DTYPES) + CONTAINMENTS_COMPLEX = pd.DataFrame( data=[[ "C", "ENSDART00000161035.1", "+", "ENSDART00000161035.1:0-326", "+", @@ -424,16 +425,16 @@ "+", 1176, "148M" ]], columns=CONTAINMENT_COLS -) +).astype(CONTAINMENT_DTYPES) PATHS_EMPTY = pd.DataFrame( columns=PATH_COLS -) +).astype(PATH_DTYPES) PATHS_SIMPLE = pd.DataFrame( data=[["P", "ENSDART00000161035.1", "ENSDART00000161035.1:0-326+", "*"]], columns=PATH_COLS -) +).astype(PATH_DTYPES) PATHS_COMPLEX = pd.DataFrame( data=[[ "P", "ENSDART00000161035.1", @@ -452,7 +453,7 @@ "*" ]], columns=PATH_COLS -) +).astype(PATH_DTYPES) GFA1_EMPTY_FN = "tests/io/empty.gfa" GFA1_SIMPLE_FN = "tests/io/simple.gfa" diff --git a/tests/test_correct.py b/tests/test_correct.py index 16b1ae5..f3b5d32 100644 --- a/tests/test_correct.py +++ b/tests/test_correct.py @@ -26,6 +26,7 @@ fasta_to_dict from exfi.io.bed import \ + BED4_COLS, BED4_DTYPES, \ bed3_to_bed4 from exfi.correct import \ @@ -81,8 +82,8 @@ def _compose_args(bloom_fn: str, gfa_fn: str) -> dict: ["ENSDART00000149335.2", 0, 486, "ENSDART00000149335.2:0-486"], ["ENSDART00000149335.2", 485, 3379, "ENSDART00000149335.2:485-3379"] ], - columns=["chrom", "chromStart", "chromEnd", "name"] -) + columns=BED4_COLS +).astype(BED4_DTYPES) def tearDownModule(): From f00009128637496c7b53e0329857fa314b4af339 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Fri, 25 Jan 2019 15:29:40 +0100 Subject: [PATCH 39/45] Universal gfa1 reader --- exfi/io/gfa1_to_bed.py | 29 ++++++--------- exfi/io/gfa1_to_fasta.py | 77 +++++++++++----------------------------- 2 files changed, 31 insertions(+), 75 deletions(-) diff --git a/exfi/io/gfa1_to_bed.py b/exfi/io/gfa1_to_bed.py index 0663b12..9003ec3 100644 --- a/exfi/io/gfa1_to_bed.py +++ b/exfi/io/gfa1_to_bed.py @@ -3,33 +3,24 @@ """exfi.io.gfa1_to_bed.py: submodule to read a GFA1 file and convert it to BED4 """ -import pandas as pd import numpy as np from exfi.io.bed import BED4_COLS, BED4_DTYPES +from exfi.io.read_gfa import read_gfa1 def gfa1_to_bed4(filename): """Read a GFA1 file and convert it to BED4""" - with open(filename, "r") as gfa: - containments = pd.DataFrame( - data=[ - x.strip().split("\t") for x in gfa.readlines() if x[0] == "C" - ], - columns=["RecordType", "Container", "ContainerOrient", "Contained", - "ContainedOrient", "Pos", "Overlap"], - dtype=None - )\ - .astype(dtype={"Pos": np.int}) + containments = read_gfa1(filename)['containments'] - containments = containments.rename({ - "Container": "chrom", - "Contained": "name" - }, axis=1) - containments["Overlap"] = containments["Overlap"]\ - .map(lambda x: np.int(x[:-1])) - containments["chrom_start"] = containments["Pos"] - containments["chrom_end"] = containments["Pos"] + containments["Overlap"] + containments = containments.rename(columns={ + "container": "chrom", + "contained": "name" + }) + containments["overlap"] = containments\ + .overlap.map(lambda x: np.int(x[:-1])) + containments["chrom_start"] = containments["pos"] + containments["chrom_end"] = containments["pos"] + containments["overlap"] containments = containments[BED4_COLS] containments = containments.astype(BED4_DTYPES) return containments diff --git a/exfi/io/gfa1_to_fasta.py b/exfi/io/gfa1_to_fasta.py index 3d8f316..20fa124 100644 --- a/exfi/io/gfa1_to_fasta.py +++ b/exfi/io/gfa1_to_fasta.py @@ -3,8 +3,6 @@ """exfi.io.gfa1_to_exons.py: submodule to read a gfa1, extract the exons and store it in fasta format""" -import pandas as pd - from exfi.io.read_gfa import read_gfa1 from exfi.io.masking import mask, cigar_to_int @@ -14,32 +12,12 @@ def gfa1_to_exons(fasta_out, gfa1_in, masking='none'): gfa1 = read_gfa1(gfa1_in) - # data = [ - # x.strip().split("\t") - # for x in gfa.readlines() if x[0] in set(["S", "L"]) - # ] - - # if not data: - # return - - # node2sequence = pd.DataFrame( - # data=[x[0:3] for x in data if x[0] == "S"], - # columns=["RecordType", "name", "sequence"] - # ).drop(columns="RecordType") - node2sequence = gfa1['segments']\ .drop(columns='record_type') if node2sequence.shape[0] == 0: return - # edge2overlap = pd.DataFrame( - # data=[x[0:6] for x in data if x[0] == 'L'], - # columns=["RecordType", "u", "FromOrient", "v", "ToOrient", - # "OverlapCigar"] - # ).drop(columns=["RecordType", "FromOrient", "ToOrient"]) - # edge2overlap["overlap"] = edge2overlap.OverlapCigar.map(cigar_to_int) - edge2overlap = gfa1['links']\ .drop(columns=['record_type', 'from_orient', 'to_orient'])\ .rename(columns={ @@ -66,44 +44,30 @@ def gfa1_to_gapped_transcripts( fasta_out, gfa1_in, gap_size=100, masking='none'): """Convert a GFA1 file to a gapped transcript file""" - with open(gfa1_in, "r") as gfa, open(fasta_out, "w") as fasta: + with open(fasta_out, "w") as fasta: separator = gap_size * 'N' - # Read only segments and paths - data = [ - x.strip().split("\t") - for x in gfa.readlines() if x[0] in set(["S", "P", "L"]) - ] + gfa1 = read_gfa1(gfa1_in) + + # Segments -> node2sequence + node2sequence = gfa1['segments']\ + .drop(columns=["record_type", 'length']) - if not data: + if node2sequence.shape[0] == 0: return - # Segments -> node2sequence - node2sequence = pd.DataFrame( - data=[x[0:3] for x in data if x[0] == "S"], - columns=["RecordType", "name", "sequence"], - )\ - .drop(columns="RecordType") - - # Links -> edge2overlap - edge2overlap = pd.DataFrame( - data=[x[0:6] for x in data if x[0] == 'L'], - columns=["RecordType", "u", "FromOrient", "v", "ToOrient", - "OverlapCigar"] - ).drop(columns=["RecordType", "FromOrient", "ToOrient"]) - edge2overlap["overlap"] = edge2overlap.OverlapCigar.map(cigar_to_int) - - # Paths -> path2nodes - path2nodes = pd.DataFrame( - data=[x[0:4] for x in data if x[0] == "P"], - columns=["RecordType", "PathName", "SegmentNames", "Overlaps"] - )\ - .drop(columns=["RecordType", "Overlaps"]) - path2nodes["SegmentNames"] = path2nodes["SegmentNames"]\ - .str.replace("+", "") - - del data + edge2overlap = gfa1['links']\ + .drop(columns=['record_type', 'from_orient', 'to_orient'])\ + .rename(columns={ + 'from': 'u', 'to': 'v', 'overlap': 'overlap_cigar' + }) + edge2overlap["overlap"] = edge2overlap.overlap_cigar.map(cigar_to_int) + + path2nodes = gfa1['paths']\ + .drop(columns=['record_type', 'overlaps']) + path2nodes.segment_names = path2nodes\ + .segment_names.str.replace('+', '') # Mask the sequences node2sequence = mask( @@ -118,13 +82,14 @@ def gfa1_to_gapped_transcripts( # Compose the sequence path2nodes["gapped_sequence"] = path2nodes\ - .SegmentNames\ + .segment_names\ .str.split(',')\ .map(lambda x: separator.join([node2sequence_dict[y] for y in x])) # Create the fasta line path2nodes["fasta"] = \ - ">" + path2nodes.PathName + " " + path2nodes.SegmentNames + "\n" + \ + ">" + path2nodes.path_name + " " + path2nodes.segment_names + \ + "\n" + \ path2nodes.gapped_sequence # Dump everything From 7ebf0c90664bcf87790f462644bffa47a478eb26 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Fri, 25 Jan 2019 17:53:00 +0100 Subject: [PATCH 40/45] Faster and memory efficient masking --- exfi/io/masking.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/exfi/io/masking.py b/exfi/io/masking.py index 4e0469d..93515df 100644 --- a/exfi/io/masking.py +++ b/exfi/io/masking.py @@ -99,18 +99,18 @@ def mask(node2sequence, edge2overlap, masking: str = "none"): complete['mask_left'] = complete.mask_left\ .map(lambda x: x if x > 0 else 0) - complete['tmp'] = tuple(zip( - complete.sequence, - complete.mask_left, - complete.mask_right - )) - if masking == "hard": logging.info("\tHard masking sequences") - complete['sequence'] = complete.tmp.map(lambda x: hard_mask(*x)) + complete['sequence'] = complete.apply( + lambda x: hard_mask(x.sequence, x.mask_left, x.mask_right), + axis=1 + ) elif masking == "soft": logging.info("\tSoft masking sequences") - complete['sequence'] = complete.tmp.map(lambda x: soft_mask(*x)) + complete['sequence'] = complete.apply( + lambda x: soft_mask(x.sequence, x.mask_left, x.mask_right), + axis=1 + ) node2sequence_masked = complete\ [['name', 'sequence']]\ From a501c73f87f2a6ba4b6760163ef49dead8df5fe4 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Fri, 25 Jan 2019 18:06:40 +0100 Subject: [PATCH 41/45] Removed segment lengths --- exfi/io/bed4_to_gfa1.py | 4 -- exfi/io/gfa1.py | 4 +- exfi/io/gfa1_to_fasta.py | 2 +- exfi/io/read_gfa.py | 2 +- tests/io/complex.gfa | 30 +++++----- tests/io/complex_hard.gfa | 30 +++++----- tests/io/complex_soft.gfa | 30 +++++----- tests/io/gfa1.py | 118 +++++++++++++++----------------------- tests/io/simple.gfa | 2 +- 9 files changed, 95 insertions(+), 127 deletions(-) diff --git a/exfi/io/bed4_to_gfa1.py b/exfi/io/bed4_to_gfa1.py index 3fb2650..1576412 100644 --- a/exfi/io/bed4_to_gfa1.py +++ b/exfi/io/bed4_to_gfa1.py @@ -39,10 +39,6 @@ def compute_segments(bed4, transcriptome_dict, masking='none'): # Add the S and length columns segments["record_type"] = "S" - # Compute lengths - segments["length"] = segments\ - .sequence.map(lambda x: "LN:i:" + str(len(x))) - # reorder segments = segments\ [SEGMENT_COLS] diff --git a/exfi/io/gfa1.py b/exfi/io/gfa1.py index d5c9c23..227c1aa 100644 --- a/exfi/io/gfa1.py +++ b/exfi/io/gfa1.py @@ -3,7 +3,7 @@ '''exfi.io.gfa1.py: submodule for auxliary variables for gfa''' HEADER_COLS = ['record_type', 'version_number'] -SEGMENT_COLS = ['record_type', 'name', 'sequence', 'length'] +SEGMENT_COLS = ['record_type', 'name', 'sequence'] LINK_COLS = ['record_type', 'from', 'from_orient', 'to', 'to_orient', 'overlap'] CONTAINMENT_COLS = [ 'record_type', 'container', 'container_orient', 'contained', @@ -14,7 +14,7 @@ HEADER_DTYPES = {'record_type': object, 'version_number': object} SEGMENT_DTYPES = { - 'record_type': object, 'name': object, 'sequence': object, 'length': object + 'record_type': object, 'name': object, 'sequence': object } LINK_DTYPES = { 'record_type': object, 'from': object, 'from_orient': object, 'to': object, diff --git a/exfi/io/gfa1_to_fasta.py b/exfi/io/gfa1_to_fasta.py index 20fa124..52cd84b 100644 --- a/exfi/io/gfa1_to_fasta.py +++ b/exfi/io/gfa1_to_fasta.py @@ -52,7 +52,7 @@ def gfa1_to_gapped_transcripts( # Segments -> node2sequence node2sequence = gfa1['segments']\ - .drop(columns=["record_type", 'length']) + .drop(columns=["record_type"]) if node2sequence.shape[0] == 0: return diff --git a/exfi/io/read_gfa.py b/exfi/io/read_gfa.py index b3d79af..f542fae 100644 --- a/exfi/io/read_gfa.py +++ b/exfi/io/read_gfa.py @@ -30,7 +30,7 @@ def read_gfa1(gfa1_fn): ).astype(HEADER_DTYPES) gfa1['segments'] = pd.DataFrame( - data=[x[0:4] for x in data if x[0] == "S"], + data=[x[0:3] for x in data if x[0] == "S"], columns=SEGMENT_COLS ).astype(SEGMENT_DTYPES) diff --git a/tests/io/complex.gfa b/tests/io/complex.gfa index 7cc0a18..20043b6 100644 --- a/tests/io/complex.gfa +++ b/tests/io/complex.gfa @@ -1,19 +1,19 @@ H VN:Z:1.0 -S ENSDART00000161035.1:0-326 TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA LN:i:326 -S ENSDART00000161035.1:397-472 AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACA LN:i:75 -S ENSDART00000161035.1:477-523 AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA LN:i:46 -S ENSDART00000165342.1:5-127 TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAAG LN:i:122 -S ENSDART00000165342.1:125-304 AGGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA LN:i:179 -S ENSDART00000165342.1:317-460 GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAG LN:i:143 -S ENSDART00000165342.1:459-592 GTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAA LN:i:133 -S ENSDART00000165342.1:591-650 AGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTGCCCT LN:i:59 -S ENSDART00000165342.1:645-746 GCCCTTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT LN:i:101 -S ENSDART00000165342.1:746-851 GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA LN:i:105 -S ENSDART00000165342.1:854-886 TGCAGCCAAACAATGCAACTGTGACAGCAGCA LN:i:32 -S ENSDART00000165342.1:899-953 TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA LN:i:54 -S ENSDART00000165342.1:974-1097 CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA LN:i:123 -S ENSDART00000165342.1:1098-1175 TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGA LN:i:77 -S ENSDART00000165342.1:1176-1324 CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG LN:i:148 +S ENSDART00000161035.1:0-326 TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA +S ENSDART00000161035.1:397-472 AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACA +S ENSDART00000161035.1:477-523 AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA +S ENSDART00000165342.1:5-127 TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAAG +S ENSDART00000165342.1:125-304 AGGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA +S ENSDART00000165342.1:317-460 GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAG +S ENSDART00000165342.1:459-592 GTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAA +S ENSDART00000165342.1:591-650 AGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTGCCCT +S ENSDART00000165342.1:645-746 GCCCTTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT +S ENSDART00000165342.1:746-851 GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA +S ENSDART00000165342.1:854-886 TGCAGCCAAACAATGCAACTGTGACAGCAGCA +S ENSDART00000165342.1:899-953 TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA +S ENSDART00000165342.1:974-1097 CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA +S ENSDART00000165342.1:1098-1175 TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGA +S ENSDART00000165342.1:1176-1324 CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG L ENSDART00000161035.1:0-326 + ENSDART00000161035.1:397-472 + 71N L ENSDART00000161035.1:397-472 + ENSDART00000161035.1:477-523 + 5N L ENSDART00000165342.1:5-127 + ENSDART00000165342.1:125-304 + 2M diff --git a/tests/io/complex_hard.gfa b/tests/io/complex_hard.gfa index 77cdf64..33aa374 100644 --- a/tests/io/complex_hard.gfa +++ b/tests/io/complex_hard.gfa @@ -1,19 +1,19 @@ H VN:Z:1.0 -S ENSDART00000161035.1:0-326 TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA LN:i:326 -S ENSDART00000161035.1:397-472 AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACA LN:i:75 -S ENSDART00000161035.1:477-523 AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA LN:i:46 -S ENSDART00000165342.1:5-127 TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGANN LN:i:122 -S ENSDART00000165342.1:125-304 NNGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA LN:i:179 -S ENSDART00000165342.1:317-460 GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAN LN:i:143 -S ENSDART00000165342.1:459-592 NTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAN LN:i:133 -S ENSDART00000165342.1:591-650 NGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTNNNNN LN:i:59 -S ENSDART00000165342.1:645-746 NNNNNTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT LN:i:101 -S ENSDART00000165342.1:746-851 GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA LN:i:105 -S ENSDART00000165342.1:854-886 TGCAGCCAAACAATGCAACTGTGACAGCAGCA LN:i:32 -S ENSDART00000165342.1:899-953 TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA LN:i:54 -S ENSDART00000165342.1:974-1097 CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA LN:i:123 -S ENSDART00000165342.1:1098-1175 TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGA LN:i:77 -S ENSDART00000165342.1:1176-1324 CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG LN:i:148 +S ENSDART00000161035.1:0-326 TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA +S ENSDART00000161035.1:397-472 AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACA +S ENSDART00000161035.1:477-523 AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA +S ENSDART00000165342.1:5-127 TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGANN +S ENSDART00000165342.1:125-304 NNGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA +S ENSDART00000165342.1:317-460 GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAN +S ENSDART00000165342.1:459-592 NTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAN +S ENSDART00000165342.1:591-650 NGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTNNNNN +S ENSDART00000165342.1:645-746 NNNNNTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT +S ENSDART00000165342.1:746-851 GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA +S ENSDART00000165342.1:854-886 TGCAGCCAAACAATGCAACTGTGACAGCAGCA +S ENSDART00000165342.1:899-953 TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA +S ENSDART00000165342.1:974-1097 CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA +S ENSDART00000165342.1:1098-1175 TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGA +S ENSDART00000165342.1:1176-1324 CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG L ENSDART00000161035.1:0-326 + ENSDART00000161035.1:397-472 + 71N L ENSDART00000161035.1:397-472 + ENSDART00000161035.1:477-523 + 5N L ENSDART00000165342.1:5-127 + ENSDART00000165342.1:125-304 + 2M diff --git a/tests/io/complex_soft.gfa b/tests/io/complex_soft.gfa index a36b20d..b44df55 100644 --- a/tests/io/complex_soft.gfa +++ b/tests/io/complex_soft.gfa @@ -1,19 +1,19 @@ H VN:Z:1.0 -S ENSDART00000161035.1:0-326 TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA LN:i:326 -S ENSDART00000161035.1:397-472 AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACA LN:i:75 -S ENSDART00000161035.1:477-523 AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA LN:i:46 -S ENSDART00000165342.1:5-127 TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAag LN:i:122 -S ENSDART00000165342.1:125-304 agGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA LN:i:179 -S ENSDART00000165342.1:317-460 GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAg LN:i:143 -S ENSDART00000165342.1:459-592 gTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAa LN:i:133 -S ENSDART00000165342.1:591-650 aGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTgccct LN:i:59 -S ENSDART00000165342.1:645-746 gccctTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT LN:i:101 -S ENSDART00000165342.1:746-851 GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA LN:i:105 -S ENSDART00000165342.1:854-886 TGCAGCCAAACAATGCAACTGTGACAGCAGCA LN:i:32 -S ENSDART00000165342.1:899-953 TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA LN:i:54 -S ENSDART00000165342.1:974-1097 CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA LN:i:123 -S ENSDART00000165342.1:1098-1175 TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGA LN:i:77 -S ENSDART00000165342.1:1176-1324 CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG LN:i:148 +S ENSDART00000161035.1:0-326 TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA +S ENSDART00000161035.1:397-472 AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAATCAACA +S ENSDART00000161035.1:477-523 AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA +S ENSDART00000165342.1:5-127 TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCACTTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAag +S ENSDART00000165342.1:125-304 agGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACAATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCACAGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA +S ENSDART00000165342.1:317-460 GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAGGTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTACCAg +S ENSDART00000165342.1:459-592 gTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGGCTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAa +S ENSDART00000165342.1:591-650 aGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTgccct +S ENSDART00000165342.1:645-746 gccctTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAGAGTCTGCTGGAGGAATCAGTGTATCCACGCT +S ENSDART00000165342.1:746-851 GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGATGGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA +S ENSDART00000165342.1:854-886 TGCAGCCAAACAATGCAACTGTGACAGCAGCA +S ENSDART00000165342.1:899-953 TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA +S ENSDART00000165342.1:974-1097 CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAAGTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA +S ENSDART00000165342.1:1098-1175 TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACATATCCTGA +S ENSDART00000165342.1:1176-1324 CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAATCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACACCACAGCG L ENSDART00000161035.1:0-326 + ENSDART00000161035.1:397-472 + 71N L ENSDART00000161035.1:397-472 + ENSDART00000161035.1:477-523 + 5N L ENSDART00000165342.1:5-127 + ENSDART00000165342.1:125-304 + 2M diff --git a/tests/io/gfa1.py b/tests/io/gfa1.py index 6c7e3f4..002a4c8 100644 --- a/tests/io/gfa1.py +++ b/tests/io/gfa1.py @@ -29,8 +29,7 @@ "ATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGG" "AAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGAT" "AGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAAC" - "TGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA", - "LN:i:326" + "TGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA" ]], columns=SEGMENT_COLS ).astype(SEGMENT_DTYPES) @@ -44,84 +43,75 @@ "AAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGAT" "AGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAAC" "TGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA", - "LN:i:326", ], [ "S", "ENSDART00000161035.1:397-472", "AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAAT" - "CAACA", - "LN:i:75", + "CAACA" ], [ "S", "ENSDART00000161035.1:477-523", - "AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA", "LN:i:46" + "AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA" ], [ "S", "ENSDART00000165342.1:5-127", "TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCAC" - "TTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAAG", "LN:i:122" + "TTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAAG" ], [ "S", "ENSDART00000165342.1:125-304", "AGGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACA" "ATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCAC" - "AGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA", "LN:i:179" + "AGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA" ], [ "S", "ENSDART00000165342.1:317-460", "GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAG" "GTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTAC" - "CAG", "LN:i:143" + "CAG" ], [ "S", "ENSDART00000165342.1:459-592", "GTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGG" - "CTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAA", - "LN:i:133" + "CTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAA" ], [ "S", "ENSDART00000165342.1:591-650", - "AGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTGCCCT", - "LN:i:59" + "AGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTGCCCT" ], [ "S", "ENSDART00000165342.1:645-746", "GCCCTTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAG" - "AGTCTGCTGGAGGAATCAGTGTATCCACGCT", - "LN:i:101", + "AGTCTGCTGGAGGAATCAGTGTATCCACGCT" ], [ "S", "ENSDART00000165342.1:746-851", "GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGAT" - "GGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA", "LN:i:105", + "GGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA" ], [ "S", "ENSDART00000165342.1:854-886", - "TGCAGCCAAACAATGCAACTGTGACAGCAGCA", - "LN:i:32" + "TGCAGCCAAACAATGCAACTGTGACAGCAGCA" ], [ "S", "ENSDART00000165342.1:899-953", - "TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA", "LN:i:54" + "TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA" ], [ "S", "ENSDART00000165342.1:974-1097", "CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAA" - "GTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA", - "LN:i:123" + "GTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA" ], [ "S", "ENSDART00000165342.1:1098-1175", "TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACAT" - "ATCCTGA", - "LN:i:77", + "ATCCTGA" ], [ "S", "ENSDART00000165342.1:1176-1324", "CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAA" "TCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACA" - "CCACAGCG", - "LN:i:148" + "CCACAGCG" ]], columns=SEGMENT_COLS ).astype(SEGMENT_DTYPES) @@ -134,85 +124,76 @@ "ATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGG" "AAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGAT" "AGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAAC" - "TGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA", - "LN:i:326", + "TGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA" ], [ "S", "ENSDART00000161035.1:397-472", "AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAAT" - "CAACA", - "LN:i:75", + "CAACA" ], [ "S", "ENSDART00000161035.1:477-523", - "AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA", "LN:i:46" + "AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA" ], [ "S", "ENSDART00000165342.1:5-127", "TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCAC" - "TTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAag", "LN:i:122" + "TTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGAag" ], [ "S", "ENSDART00000165342.1:125-304", "agGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACA" "ATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCAC" - "AGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA", "LN:i:179" + "AGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA" ], [ "S", "ENSDART00000165342.1:317-460", "GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAG" "GTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTAC" - "CAg", "LN:i:143" + "CAg" ], [ "S", "ENSDART00000165342.1:459-592", "gTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGG" - "CTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAa", - "LN:i:133" + "CTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAa" ], [ "S", "ENSDART00000165342.1:591-650", - "aGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTgccct", - "LN:i:59" + "aGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTgccct" ], [ "S", "ENSDART00000165342.1:645-746", "gccctTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAG" - "AGTCTGCTGGAGGAATCAGTGTATCCACGCT", - "LN:i:101", + "AGTCTGCTGGAGGAATCAGTGTATCCACGCT" ], [ "S", "ENSDART00000165342.1:746-851", "GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGAT" - "GGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA", "LN:i:105", + "GGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA" ], [ "S", "ENSDART00000165342.1:854-886", - "TGCAGCCAAACAATGCAACTGTGACAGCAGCA", - "LN:i:32" + "TGCAGCCAAACAATGCAACTGTGACAGCAGCA" ], [ "S", "ENSDART00000165342.1:899-953", - "TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA", "LN:i:54" + "TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA" ], [ "S", "ENSDART00000165342.1:974-1097", "CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAA" - "GTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA", - "LN:i:123" + "GTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA" ], [ "S", "ENSDART00000165342.1:1098-1175", "TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACAT" - "ATCCTGA", - "LN:i:77", + "ATCCTGA" ], [ "S", "ENSDART00000165342.1:1176-1324", "CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAA" "TCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACA" - "CCACAGCG", - "LN:i:148" + "CCACAGCG" ]], columns=SEGMENT_COLS ).astype(SEGMENT_DTYPES) @@ -225,85 +206,76 @@ "ATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGG" "AAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGAT" "AGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAAC" - "TGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA", - "LN:i:326", + "TGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA" ], [ "S", "ENSDART00000161035.1:397-472", "AGGAACTACGGTGGAGTGTATGTGGGTCTTCCTGCTGATCTGACTGCAGTCGCTGCCAGTCAGTCCAAAT" - "CAACA", - "LN:i:75", + "CAACA" ], [ "S", "ENSDART00000161035.1:477-523", - "AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA", "LN:i:46" + "AGTCAACAGATGTTTATTGCAGACCTTCAGATAAAACAACATAGAA" ], [ "S", "ENSDART00000165342.1:5-127", "TGGAGCTGAAGCCGAGTATCTTGGTATTGGACTGGAACAGAAATCCAGCAAAAACTTTAAGGGAAATCAC" - "TTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGANN", "LN:i:122" + "TTTCATTTCATGATCGAAAAACTCCCGCAGATCATAAAAGAGTGGAAGGANN" ], [ "S", "ENSDART00000165342.1:125-304", "NNGACCTGTAGTAGAAACAAAACTAGGATCTCTGAGAGGTGCCTTCTTGACTGTGAAGGGCAAGGACACA" "ATAGTCAATAGTTATCTAGGTGTGCCGTTCGCCAAGCCGCCTGTAGGACCCCTGAGACTTGCTCGACCAC" - "AGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA", "LN:i:179" + "AGGCTGCAGAGAAATGGCAAGGAGTTAGAGATGCCACCA" ], [ "S", "ENSDART00000165342.1:317-460", "GTGCCTCCAGGAAAGGCAAATGACTGTAACTGAACTGGAGTTTCTATCGATGGATGTGGAGGTTCCTGAG" "GTCTCGGAGGATTGCCTGTATCTTAACATCTACACCCCAGTTAAACCTGGACAAGGAGACAAGAAGTTAC" - "CAN", "LN:i:143" + "CAN" ], [ "S", "ENSDART00000165342.1:459-592", "NTCATGGTTTGGATTCATGGTGGAGGACTCTCTCTTGGATCGGCTTCAATGTATGATGGCTCTGTTCTGG" - "CTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAN", - "LN:i:133" + "CTGCGTATCAGGATGTGGTCGTGGTGCTCATTCAGTACAGATTGGGTCTTCTGGGGTTCTTAN" ], [ "S", "ENSDART00000165342.1:591-650", - "NGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTNNNNN", - "LN:i:59" + "NGCACCGGAGACGAGCATGCGCCAGGAAACTATGGTTTTCTGGATCAAGTAGCTNNNNN" ], [ "S", "ENSDART00000165342.1:645-746", "NNNNNTCAGTGGGTTCAGGAGAACATCCACAGCTTCGGTGGAGATCCTGGATCAGTGACCATCTTTGGAG" - "AGTCTGCTGGAGGAATCAGTGTATCCACGCT", - "LN:i:101", + "AGTCTGCTGGAGGAATCAGTGTATCCACGCT" ], [ "S", "ENSDART00000165342.1:746-851", "GATTCTTTCCCCGCTGGCGTCTGGACTGTTTCATCGCGCCATTGCAGAAAGTGGAACTGCCTTCTGGGAT" - "GGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA", "LN:i:105", + "GGTTTAGTCATGGCTGATCCTTTTCAGAGAGCCCA" ], [ "S", "ENSDART00000165342.1:854-886", - "TGCAGCCAAACAATGCAACTGTGACAGCAGCA", - "LN:i:32" + "TGCAGCCAAACAATGCAACTGTGACAGCAGCA" ], [ "S", "ENSDART00000165342.1:899-953", - "TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA", "LN:i:54" + "TGTCGACTGCATTATGCACTGGTCTGAAGAGGAGGCTCTGGAATGTGCTAAAAA" ], [ "S", "ENSDART00000165342.1:974-1097", "CGTTGCTGTAGATTCTTATTTCCTTCCCAAACCCATCGAGGAGATTGTTGAGAAACAAGAGTTTAGTAAA" - "GTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA", - "LN:i:123" + "GTTCCTCTCATCAACGGCATTAACAATGATGAGTTTGGCTTCTTGTTGGCTGA" ], [ "S", "ENSDART00000165342.1:1098-1175", "TATTTCTTGGGTCCTGAATGGATGAATGGGTTGAAAAGAGAGCAAATCGCTGAAGCCTTGACGCTCACAT" - "ATCCTGA", - "LN:i:77", + "ATCCTGA" ], [ "S", "ENSDART00000165342.1:1176-1324", "CCCAAGGATCGATGGATCATTGATCTGGTGGCGAAGGAATATCTGGGCGACACACACGACCCCATTGAAA" "TCCGTGAAGTTTATCGGGAGATGATGGGAGACGTGCTGTTTAACATCCCTGCCCTGCAACTGGCAAAACA" - "CCACAGCG", - "LN:i:148" + "CCACAGCG" ]], columns=SEGMENT_COLS ).astype(SEGMENT_DTYPES) diff --git a/tests/io/simple.gfa b/tests/io/simple.gfa index 488ea37..72827ac 100644 --- a/tests/io/simple.gfa +++ b/tests/io/simple.gfa @@ -1,4 +1,4 @@ H VN:Z:1.0 -S ENSDART00000161035.1:0-326 TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA LN:i:326 +S ENSDART00000161035.1:0-326 TGCACGGGTTTATTGTTCACAAAGAGATCGACAATGTGCGCAACTAAAATAAACATAGTACATTTTGATTATACACGAACTTAAACTAAAGTCCAATCACACCTCCGCCCCGTTTCCACAGCAGCCTGTCAGGGTGGAGGAAAAGCGCGGCGGTCATGTGAGGCTCGAGCATCTCTCTCTCTCTCTCTCTCTCTCTCTCTACAGAATGATAGAGGGAGCTCGTGAATCACATCATAGTCGTCCTCCCCTCATTCGTCCTCTCCAGCAGACACCGAAAAACTGCGTTCATGCCAAAATGGGATGTGGAAATTCCTCCGCCACGAGCA C ENSDART00000161035.1 + ENSDART00000161035.1:0-326 + 0 326M P ENSDART00000161035.1 ENSDART00000161035.1:0-326+ * From 84ca2bbe1ed36740cdc8d0405cc045c08ae728b0 Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Fri, 25 Jan 2019 19:11:27 +0100 Subject: [PATCH 42/45] Logging --- bin/build_baited_bloom_filter | 15 ++++++++--- bin/build_splice_graph | 19 +++----------- bin/gfa1_to_exons | 12 ++++----- bin/gfa1_to_gapped_transcripts | 12 ++++----- exfi/correct.py | 5 ++++ exfi/find_exons.py | 3 ++- exfi/io/bed4_to_gfa1.py | 29 ++++++++++++++------ exfi/io/fasta_to_dict.py | 4 +++ exfi/io/gfa1_to_bed.py | 7 +++++ exfi/io/gfa1_to_fasta.py | 32 ++++++++++++++++++----- exfi/io/gff3_to_bed.py | 25 +++++++++++++----- exfi/io/masking.py | 12 ++++++--- exfi/io/read_bed.py | 4 +++ exfi/io/read_gfa.py | 11 +++++++- exfi/polish.py | 48 +++++++++++++++++++++------------- 15 files changed, 163 insertions(+), 75 deletions(-) diff --git a/bin/build_baited_bloom_filter b/bin/build_baited_bloom_filter index 86959b3..92cd2e0 100755 --- a/bin/build_baited_bloom_filter +++ b/bin/build_baited_bloom_filter @@ -2,13 +2,15 @@ import argparse import logging -from exfi import __version__ -from exfi.build_baited_bloom_filter import build_baited_bloom_filter import sys from os.path import isfile, exists, dirname, abspath from os import \ makedirs, \ remove +from shutil import which + +from exfi import __version__ +from exfi.build_baited_bloom_filter import build_baited_bloom_filter parser = argparse.ArgumentParser( usage='build_baited_bloom_filter ' @@ -122,7 +124,10 @@ if __name__ == '__main__': # Set up logger logger = logging.getLogger() - logging.basicConfig(format='%(asctime)s\t%(module)s\t%(message)s', level=logging.ERROR) + logging.basicConfig( + format='%(asctime)s\t%(module)s\t%(message)s', + level=logging.ERROR + ) if args["verbose"]: logger.setLevel(logging.INFO) if args["debug"]: @@ -130,13 +135,14 @@ if __name__ == '__main__': # Check inputs + logging.info('Checking input parameters') assert args["kmer"] >= 1, 'ERROR: incorrect kmer size' # assert bloom_size assert args["levels"] >= 1, 'ERROR: incorrect number of levels' assert args["threads"] >= 1, 'ERROR: incorrect number of threads' # Check if programs are in path - from shutil import which + logging.info('Checking if biobloom* and abyss-bloom are in $PATH') assert which('biobloommaker') is not None, 'ERROR: biobloommaker not in PATH' assert which('biobloomcategorizer') is not None, 'ERROR: biobloomcategorizer not in PATH' assert which('abyss-bloom') is not None, 'ERROR: abyss-bloom not in PATH' @@ -147,6 +153,7 @@ if __name__ == '__main__': makedirs(output_dir) # Run the program + logging.info('Running build_baited_bloom_filter') build_baited_bloom_filter(args) logging.info("Done!") diff --git a/bin/build_splice_graph b/bin/build_splice_graph index 81b7b44..1952150 100755 --- a/bin/build_splice_graph +++ b/bin/build_splice_graph @@ -11,7 +11,6 @@ from os.path import \ from Bio import SeqIO from exfi import __version__ - from exfi.find_exons import find_exons from exfi.io.bed import bed3_to_bed4 from exfi.io.fasta_to_dict import fasta_to_dict @@ -19,6 +18,8 @@ from exfi.polish import polish_bed4 from exfi.correct import correct_bed4 from exfi.io.bed4_to_gfa1 import bed4_to_gfa1 + + parser = argparse.ArgumentParser( usage='build_splicegraph -i transcriptome.fa -b bloom_filter.bf -k 30 ' '-o exome.gfa', @@ -175,8 +176,7 @@ if __name__ == "__main__": logger.setLevel(logging.DEBUG) # Get predicted exons in bed format - bed3 = find_exons(args) - bed4 = bed3_to_bed4(bed3) + bed4 = bed3_to_bed4(find_exons(args)) # Transcriptome_dict transcriptome_dict = fasta_to_dict(args["fasta"]) @@ -189,22 +189,9 @@ if __name__ == "__main__": bed4=bed4, transcriptome_dict=transcriptome_dict, args=args ) - # if args["collapse"]: - # splice_graph_dict = { - # "collapsed": collapse_splice_graph_dict( - # splice_graph_dict=splice_graph_dict, - # transcriptome_dict=transcriptome_dict - # ) - # } - # Write to GFA1 bed4_to_gfa1( gfa1_fn=args["gfa1"], bed4=bed4, transcriptome_dict=transcriptome_dict ) - # splice_graph_dict_to_gfa1( - # splice_graph_dict=splice_graph_dict, - # transcriptome_dict=transcriptome_dict, - # filename=args["gfa1"] - # ) logging.info("Done!") diff --git a/bin/gfa1_to_exons b/bin/gfa1_to_exons index a43b3c0..e27600e 100755 --- a/bin/gfa1_to_exons +++ b/bin/gfa1_to_exons @@ -101,16 +101,16 @@ if __name__ == "__main__": if args["debug"]: logger.setLevel(logging.DEBUG) - # masking = "none" - # if args["soft_mask_overlaps"] == True: - # masking = "soft" - # if args["hard_mask_overlaps"] == True: - # masking = "hard" + masking = "none" + if args["soft_mask_overlaps"] == True: + masking = "soft" + if args["hard_mask_overlaps"] == True: + masking = "hard" gfa1_to_exons( fasta_out=args["fasta"], gfa1_in=args["gfa1"], - # masking=masking + masking=masking ) logging.info("Done!") diff --git a/bin/gfa1_to_gapped_transcripts b/bin/gfa1_to_gapped_transcripts index 6298403..75b13e1 100755 --- a/bin/gfa1_to_gapped_transcripts +++ b/bin/gfa1_to_gapped_transcripts @@ -114,17 +114,17 @@ if __name__ == "__main__": if args["debug"]: logger.setLevel(logging.DEBUG) - # masking = "none" - # if args["soft_mask_overlaps"] == True: - # masking = "soft" - # if args["hard_mask_overlaps"] == True: - # masking = "hard" + masking = "none" + if args["soft_mask_overlaps"] == True: + masking = "soft" + if args["hard_mask_overlaps"] == True: + masking = "hard" gfa1_to_gapped_transcripts( fasta_out=args["fasta"], gfa1_in=args["gfa1"], gap_size=args["number_of_ns"], - # masking=masking + masking=masking ) logging.info("Done!") diff --git a/exfi/correct.py b/exfi/correct.py index 7ee22b2..b7c7ed5 100755 --- a/exfi/correct.py +++ b/exfi/correct.py @@ -2,6 +2,7 @@ """exfi.new_correct.py: fill small overlaps and gaps with abyss-sealer""" +import logging from tempfile import \ mkstemp @@ -161,11 +162,15 @@ def correct_bed4(bed4, transcriptome_dict, args): """Inspect the bed4 for small gaps and overlaps, write a fasta file for sealer, and correct the bed4. """ + logging.info('Preparing abyss-sealer') sealer_input_fn = prepare_sealer( bed4=bed4, transcriptome_dict=transcriptome_dict, args=args ) + logging.info('Running abyss-sealer') sealer_output_fn = run_sealer(sealer_input_fn=sealer_input_fn, args=args) + logging.info('Collecting abyss-sealer\'s results') sealer_results = collect_sealer_results(filename=sealer_output_fn) + logging.info('Applying correction to BED4') bed4_corrected = apply_correction_to_bed4(bed4, sealer_results) os.remove(sealer_input_fn) os.remove(sealer_output_fn) diff --git a/exfi/find_exons.py b/exfi/find_exons.py index 79fcbff..fc89fbb 100644 --- a/exfi/find_exons.py +++ b/exfi/find_exons.py @@ -18,7 +18,7 @@ def process_output(process): - """Get lines in bed format from the output of a Popen. + """Get lines in BED3 format from the output of a Popen. :param Popen process: Popen object. """ @@ -89,4 +89,5 @@ def find_exons(args): p_filter = Popen(c_filter, stdin=p_merge1.stdout, stdout=PIPE) p_merge2 = Popen(c_merge2, stdin=p_filter.stdout, stdout=PIPE) p_kmers.stdout.close() + logging.info('Done') return process_output(p_merge2) diff --git a/exfi/io/bed4_to_gfa1.py b/exfi/io/bed4_to_gfa1.py index 1576412..080d287 100644 --- a/exfi/io/bed4_to_gfa1.py +++ b/exfi/io/bed4_to_gfa1.py @@ -3,6 +3,8 @@ """exfi.io.bed4_to_gfa1.py: submodule to write a BED4 dataframe to GFA1 format """ +import logging + import pandas as pd from exfi.io.bed import \ @@ -17,6 +19,7 @@ def compute_header(): """Write GFA1 header""" + logging.info('Computing the header') header = pd.DataFrame( data=[["H", "VN:Z:1.0"]], columns=HEADER_COLS @@ -26,47 +29,52 @@ def compute_header(): def compute_segments(bed4, transcriptome_dict, masking='none'): """Create the Segments subdataframe for GFA1 file""" + logging.info('Computing node2sequence') segments = bed4_to_node2sequence( bed4=bed4, transcriptome_dict=transcriptome_dict ) + logging.info('Computing edge2overlap') edge2overlap = bed4_to_edge2overlap(bed4) + logging.info('Masking') segments = mask( node2sequence=segments, edge2overlap=edge2overlap, masking=masking ) del edge2overlap - # Add the S and length columns + logging.info('Adding the record_type') segments["record_type"] = "S" - # reorder - segments = segments\ - [SEGMENT_COLS] - - return segments + return segments[SEGMENT_COLS] def compute_links(bed4): """Compute the Links subdataframe of a GFA1 file.""" + logging.info('Computing edge2overlap') links = bed4_to_edge2overlap(bed4=bed4)\ .rename(columns={'u': 'from', 'v': 'to'}) + logging.info('Adding record_type, from_orient, to_orient') links["record_type"] = "L" links["from_orient"] = "+" links["to_orient"] = "+" + logging.info('Computing the overlap between exons') links["overlap"] = links.overlap.map(lambda x: str(x) + "M" if x >= 0 else str(-x) + "N") - links = links[LINK_COLS] - return links + logging.info('Reordering') + return links[LINK_COLS] def compute_containments(bed4): """Create the minimal containments subdataframe""" containments = bed4.copy() + logging.info('Adding record_type, container, container_orient, contained, ' + 'contained_orient, and pos') containments["record_type"] = "C" containments["container"] = containments["chrom"] containments["container_orient"] = "+" containments["contained"] = containments["name"] containments["contained_orient"] = "+" containments["pos"] = containments["chrom_start"] + logging.info('Computing the overlap') containments["overlap"] = containments["chrom_end"] - containments["chrom_start"] containments["overlap"] = containments.overlap.map(lambda x: str(x) + "M") containments = containments.drop( @@ -95,16 +103,21 @@ def compute_paths(bed4): def bed4_to_gfa1(gfa1_fn, bed4, transcriptome_dict, masking='none'): """Convert the BED4 dataframe into a GFA1 file""" with open(gfa1_fn, "w", 1024**3) as gfa: + logging.info('Writing the header') compute_header()\ .to_csv(gfa, sep="\t", header=False, index=False) with open(gfa1_fn, "a", 1024**3) as gfa: + logging.info('Writing the segments') compute_segments( bed4=bed4, transcriptome_dict=transcriptome_dict, masking=masking )\ .to_csv(gfa, sep="\t", header=False, index=False) + logging.info('Writing the links') compute_links(bed4=bed4)\ .to_csv(gfa, sep="\t", header=False, index=False) + logging.info('Writing the containments') compute_containments(bed4=bed4)\ .to_csv(gfa, sep="\t", header=False, index=False) + logging.info('Writing the paths') compute_paths(bed4=bed4)\ .to_csv(gfa, sep="\t", header=False, index=False) diff --git a/exfi/io/fasta_to_dict.py b/exfi/io/fasta_to_dict.py index a82c2e8..3306796 100644 --- a/exfi/io/fasta_to_dict.py +++ b/exfi/io/fasta_to_dict.py @@ -2,6 +2,8 @@ """exfi.io.fasta_to_dict.py: submodule to convert a fasta file into a dict""" +import logging + from Bio.SeqIO.FastaIO import \ SimpleFastaParser @@ -13,8 +15,10 @@ def fasta_to_dict(filename): :param filename: str: Path to the fasta file """ + logging.info('Dumping fasta to dict') with open(filename, "r") as handle: return { identifier.split()[0]: sequence for identifier, sequence in SimpleFastaParser(handle) } + logging.info('Done') diff --git a/exfi/io/gfa1_to_bed.py b/exfi/io/gfa1_to_bed.py index 9003ec3..07e6bea 100644 --- a/exfi/io/gfa1_to_bed.py +++ b/exfi/io/gfa1_to_bed.py @@ -3,6 +3,8 @@ """exfi.io.gfa1_to_bed.py: submodule to read a GFA1 file and convert it to BED4 """ +import logging + import numpy as np from exfi.io.bed import BED4_COLS, BED4_DTYPES @@ -11,16 +13,21 @@ def gfa1_to_bed4(filename): """Read a GFA1 file and convert it to BED4""" + logging.info('Converting GFA1 to BED4') containments = read_gfa1(filename)['containments'] + logging.info('Renaming columns') containments = containments.rename(columns={ "container": "chrom", "contained": "name" }) + logging.info('Overlap to int') containments["overlap"] = containments\ .overlap.map(lambda x: np.int(x[:-1])) + logging.info('Computing coordinates') containments["chrom_start"] = containments["pos"] containments["chrom_end"] = containments["pos"] + containments["overlap"] containments = containments[BED4_COLS] containments = containments.astype(BED4_DTYPES) + logging.info('Done') return containments diff --git a/exfi/io/gfa1_to_fasta.py b/exfi/io/gfa1_to_fasta.py index 52cd84b..da2c4b7 100644 --- a/exfi/io/gfa1_to_fasta.py +++ b/exfi/io/gfa1_to_fasta.py @@ -3,95 +3,115 @@ """exfi.io.gfa1_to_exons.py: submodule to read a gfa1, extract the exons and store it in fasta format""" +import logging + from exfi.io.read_gfa import read_gfa1 from exfi.io.masking import mask, cigar_to_int def gfa1_to_exons(fasta_out, gfa1_in, masking='none'): """Extract the exons in Fasta format""" + + logging.info('Converting GFA1 to exons in fasta format') + with open(fasta_out, "w") as fasta: + logging.info('Reading the GFA1 file') gfa1 = read_gfa1(gfa1_in) + logging.info('Computing node2sequence') node2sequence = gfa1['segments']\ .drop(columns='record_type') if node2sequence.shape[0] == 0: return + logging.info('Computing edge2overlap') edge2overlap = gfa1['links']\ .drop(columns=['record_type', 'from_orient', 'to_orient'])\ .rename(columns={ 'from': 'u', 'to': 'v', 'overlap': 'overlap_cigar' }) + logging.info('Computing overlap from CIGAR to int') edge2overlap['overlap'] = edge2overlap.overlap_cigar.map(cigar_to_int) + logging.info('Masking (if necessary)') node2sequence = mask( node2sequence=node2sequence, edge2overlap=edge2overlap, masking=masking ) + logging.info('Composing fasta sequences') node2sequence["fasta"] = \ ">" + node2sequence["name"] + "\n" + \ node2sequence["sequence"] + logging.info('Dumping fasta to disk') node2sequence.fasta.values.tofile(fasta, sep="\n", format="%s") fasta.write("\n") # Final end line - + logging.info('Done') def gfa1_to_gapped_transcripts( fasta_out, gfa1_in, gap_size=100, masking='none'): """Convert a GFA1 file to a gapped transcript file""" + logging.info('Converting GFA1 to gapped transcripts in fasta format') + with open(fasta_out, "w") as fasta: separator = gap_size * 'N' + logging.info('Reading the GFA1 file') gfa1 = read_gfa1(gfa1_in) - # Segments -> node2sequence + logging.info('Computing node2sequence') node2sequence = gfa1['segments']\ .drop(columns=["record_type"]) if node2sequence.shape[0] == 0: return + logging.info('Computing edge2overlap') edge2overlap = gfa1['links']\ .drop(columns=['record_type', 'from_orient', 'to_orient'])\ .rename(columns={ 'from': 'u', 'to': 'v', 'overlap': 'overlap_cigar' }) + logging.info('Computing overlap CIGAR to int') edge2overlap["overlap"] = edge2overlap.overlap_cigar.map(cigar_to_int) + logging.info('Computing path2nodes') path2nodes = gfa1['paths']\ .drop(columns=['record_type', 'overlaps']) path2nodes.segment_names = path2nodes\ .segment_names.str.replace('+', '') - # Mask the sequences + logging.info('Masking sequences (if necessary)') node2sequence = mask( node2sequence=node2sequence, edge2overlap=edge2overlap, masking=masking ) + logging.info('Converting node2sequence to dict') node2sequence_dict = node2sequence\ .set_index('name')\ .to_dict()['sequence'] - # Compose the sequence + logging.info('Composing the gapped sequence') path2nodes["gapped_sequence"] = path2nodes\ .segment_names\ .str.split(',')\ .map(lambda x: separator.join([node2sequence_dict[y] for y in x])) - # Create the fasta line + logging.info('Composing fasta sequences') path2nodes["fasta"] = \ ">" + path2nodes.path_name + " " + path2nodes.segment_names + \ "\n" + \ path2nodes.gapped_sequence - # Dump everything + logging.info('Dumping fasta to disk') path2nodes.fasta.values.tofile(fasta, sep="\n", format="%s") fasta.write("\n") # Final end line + logging.info('Done') diff --git a/exfi/io/gff3_to_bed.py b/exfi/io/gff3_to_bed.py index 0db9019..9fd5bf0 100644 --- a/exfi/io/gff3_to_bed.py +++ b/exfi/io/gff3_to_bed.py @@ -5,10 +5,18 @@ import sys +import logging + import pandas as pd from exfi.io.bed import BED3_COLS, BED3_DTYPES +GFF3_COLS = [ + "seqid", "source", "type", "start", "end", "score", "strand", "phase", + "attributes" +] + + def gff3_to_bed3(gff3_in, mode="ensembl"): """Read a GFF3 file and convert it to BED3, where coordinates are with respect to the transcriptome @@ -19,11 +27,7 @@ def gff3_to_bed3(gff3_in, mode="ensembl"): - "ncbi": for GFF3 files downloaded from NCBI Genomes """ - gff3_columns = [ - "seqid", "source", "type", "start", "end", "score", "strand", "phase", - "attributes" - ] - + logging.info("Reading GFF3 file") raw = pd.read_csv( sep='\t', na_values=".", @@ -31,7 +35,7 @@ def gff3_to_bed3(gff3_in, mode="ensembl"): filepath_or_buffer=gff3_in, comment="#", header=None, - names=gff3_columns, + names=GFF3_COLS, low_memory=False # Convert types at the end. Seqid is char, not int ) @@ -40,6 +44,7 @@ def gff3_to_bed3(gff3_in, mode="ensembl"): exons = exons.astype(BED3_DTYPES) return exons + logging.info('Extracting the transcript ids') if mode == "gmap": exons = raw[raw['type'] == 'cDNA_match'].drop(columns='type') exons['transcript_id'] = exons['attributes']\ @@ -59,6 +64,8 @@ def gff3_to_bed3(gff3_in, mode="ensembl"): exons = exons[['transcript_id', 'strand', 'start', 'end']] + logging.info('Reordering exons by strand') + positive = ( exons [exons['strand'] == '+'] @@ -78,7 +85,10 @@ def gff3_to_bed3(gff3_in, mode="ensembl"): merged = pd.concat([positive, negative]) + + logging.info('Computing lengths') merged['length'] = merged['end'] - merged['start'] + 1 + logging.info('Computing ends') merged['transcript_end'] = ( merged .groupby('transcript_id') @@ -86,8 +96,10 @@ def gff3_to_bed3(gff3_in, mode="ensembl"): .cumsum() ) + logging.info('Computing starts') merged['transcript_start'] = merged['transcript_end'] - merged['length'] + logging.info('Tidying up') merged = merged[['transcript_id', 'transcript_start', 'transcript_end']] merged = merged.rename(columns={ @@ -100,4 +112,5 @@ def gff3_to_bed3(gff3_in, mode="ensembl"): merged = merged.reset_index(drop=True) + logging.info('Done') return merged diff --git a/exfi/io/masking.py b/exfi/io/masking.py index 93515df..8b4a744 100644 --- a/exfi/io/masking.py +++ b/exfi/io/masking.py @@ -73,12 +73,16 @@ def mask(node2sequence, edge2overlap, masking: str = "none"): :param str masking: Type of masking to apply. Options: hard, soft, none (Default value = "None") . """ + + logging.info('Masking sequences') + if masking == 'none': return node2sequence # Compose a dataframe of name, sequence, bases to trim to the left # and bases to trim to the right + logging.info('Computing bases to trim to the right and to the left') complete = node2sequence.merge( edge2overlap[['u', 'overlap']]\ .rename(columns={'u': 'name', 'overlap': 'mask_right'}), @@ -93,27 +97,29 @@ def mask(node2sequence, edge2overlap, masking: str = "none"): .fillna(0)\ .astype({'mask_right': np.int64, 'mask_left':np.int64}) - # Set to zero overlaps < 0 + logging.info('Removing negative masking') complete['mask_right'] = complete.mask_right\ .map(lambda x: x if x > 0 else 0) complete['mask_left'] = complete.mask_left\ .map(lambda x: x if x > 0 else 0) if masking == "hard": - logging.info("\tHard masking sequences") + logging.info("Hard masking sequences") complete['sequence'] = complete.apply( lambda x: hard_mask(x.sequence, x.mask_left, x.mask_right), axis=1 ) elif masking == "soft": - logging.info("\tSoft masking sequences") + logging.info("Soft masking sequences") complete['sequence'] = complete.apply( lambda x: soft_mask(x.sequence, x.mask_left, x.mask_right), axis=1 ) + logging.info('Tidying up') node2sequence_masked = complete\ [['name', 'sequence']]\ .reset_index(drop=True) + logging.info('Done') return node2sequence_masked diff --git a/exfi/io/read_bed.py b/exfi/io/read_bed.py index d7e306e..d4d85f2 100644 --- a/exfi/io/read_bed.py +++ b/exfi/io/read_bed.py @@ -2,6 +2,8 @@ """exfi.io.read_bed.py: BED importer""" +import logging + import pandas as pd from exfi.io.bed import BED3_COLS, BED3_DTYPES @@ -9,6 +11,7 @@ def read_bed3(filename): """Read a BED file and return the BED3 dataframe.""" + logging.info('Reading BED3 from disk') bed3 = pd.read_csv( filepath_or_buffer=filename, header=None, @@ -17,4 +20,5 @@ def read_bed3(filename): names=BED3_COLS, engine='c' ).astype(BED3_DTYPES) + logging.info('Done') return bed3 diff --git a/exfi/io/read_gfa.py b/exfi/io/read_gfa.py index f542fae..07a7a30 100644 --- a/exfi/io/read_gfa.py +++ b/exfi/io/read_gfa.py @@ -2,7 +2,7 @@ """exfi.io.read_gfa.py: submodule to read GFA1 files""" - +import logging import pandas as pd from exfi.io.gfa1 import \ @@ -15,38 +15,47 @@ def read_gfa1(gfa1_fn): keys are header, segments, links, containments, and paths. Values are DataFrames, with the exception of the header""" + logging.info('Reading a GFA1 from disk') + with open(gfa1_fn, 'r') as gfa: gfa1 = {} + logging.info('Reading raw data') data = [ x.strip().split("\t") for x in gfa.readlines() if x[0] in set(['H', 'S', 'L', 'C', 'P']) ] + logging.info('Processing header lines') gfa1['header'] = pd.DataFrame( data=[x[0:2] for x in data if x[0] == 'H'], columns=HEADER_COLS ).astype(HEADER_DTYPES) + logging.info('Processing segment lines') gfa1['segments'] = pd.DataFrame( data=[x[0:3] for x in data if x[0] == "S"], columns=SEGMENT_COLS ).astype(SEGMENT_DTYPES) + logging.info('Processing link lines') gfa1['links'] = pd.DataFrame( data=[x[0:6] for x in data if x[0] == 'L'], columns=LINK_COLS ).astype(LINK_DTYPES) + logging.info('Processing containment lines') gfa1['containments'] = pd.DataFrame( data=[x[0:7] for x in data if x[0] == 'C'], columns=CONTAINMENT_COLS ).astype(CONTAINMENT_DTYPES) + logging.info('Processing path lines') gfa1['paths'] = pd.DataFrame( data=[x[0:4] for x in data if x[0] == 'P'], columns=PATH_COLS ).astype(PATH_DTYPES) + logging.info('Done') return gfa1 diff --git a/exfi/polish.py b/exfi/polish.py index 77a4ba2..0507000 100755 --- a/exfi/polish.py +++ b/exfi/polish.py @@ -4,64 +4,72 @@ two exons there is the AG-GT splicing signal. """ +import logging + def polish_bed4(bed4, transcriptome_dict): """ Trim overlapping exons according to the AG-GT signal. """ + logging.info("Polishing BED4") + polished = bed4.copy() - # Get the transcript_id of the next exon + logging.info("Get the transcript_id of the next exon") polished["chrom_next"] = polished["chrom"].shift(-1) - # Get the name of the next exon + logging.info('Get the name of the next exon') polished["name_next"] = polished["name"].shift(-1) - # Get the start of the next exon + logging.info('Get the start of the next exon') polished["chrom_start_next"] = polished["chrom_start"].shift(-1) - # Get the end of the next exon + logging.info('Get the end of the next exon') polished["chrom_end_next"] = polished["chrom_end"].shift(-1) - # Remove rows with different transcripts + logging.info('Remove rows with different transcripts') polished = polished\ [polished["chrom"] == polished["chrom_next"]] - # cast from float to int - polished = polished.astype({"chrom_start_next": int, "chrom_end_next": int}) + logging.info('Cast from float to int (just in case)') + polished = polished.astype({ + "chrom_start_next": int, + "chrom_end_next": int + }) - # compute the overlap + logging.info('Compute the overlap') polished["overlap"] = polished["chrom_end"] - polished["chrom_start_next"] - # Throw away lines that cannot be polished + logging.info('Throw away lines that cannot be polished') polished = polished[polished.overlap >= 4] - # Get the entire transcript sequence + logging.info('Get the entire transcript sequence') polished["sequence"] = polished.chrom.map(transcriptome_dict) - # Prepare a column with the data required to extract the overlapping seq + logging.info('Prepare a column with the data required to extract the ' + 'overlapping sequence') polished["data_to_map"] = list(zip( polished.sequence, polished.chrom_start_next + 1, polished.chrom_end + 1 )) - # Get the overlapping sequence + logging.info('Get the overlapping sequence') polished["overlap_str"] = polished\ .data_to_map\ .map(lambda x: x[0][x[1]:x[2]]) - # Get the position in which the AGGT happens + logging.info('Get the position in which the AGGT happens') polished["overlap_index"] = polished["overlap_str"].str.rfind("AGGT") - # Throw away rows in which AGGT doesn't happen + logging.info('Throw away rows in which AGGT doesn\'t happen') polished = polished[polished.overlap_index >= 0] - # Correct positions + logging.info('Correct positions') polished["chrom_end_corrected"] = polished["chrom_end"] - 2 polished["chrom_start_next_corrected"] = \ polished["chrom_start_next"] + polished["overlap_index"] + 2 - # Organize the elements to correct + logging.info('Organize the elements to correct') ends_to_change = polished\ [["name", "chrom_end_corrected"]]\ .rename({"chrom_end_corrected": "chrom_end"}, axis=1)\ @@ -78,17 +86,21 @@ def polish_bed4(bed4, transcriptome_dict): bed4_new = bed4.set_index("name") - # Correct the starts + + logging.info('Correct the starts') bed4_new.loc[starts_to_change.index.tolist()].chrom_start = \ starts_to_change.chrom_start - # Correct the ends + logging.info('Correct the ends') bed4_new.loc[ends_to_change.index.tolist()].chrom_end = \ ends_to_change.chrom_end + logging.info('Compose the new names') bed4_new = bed4_new.reset_index(drop=False) bed4_new["name"] = \ bed4_new.chrom + ":" + \ bed4_new.chrom_start.map(str) + "-" + \ bed4_new.chrom_end.map(str) + logging.info('Done') + return bed4_new[["chrom", "chrom_start", "chrom_end", "name"]] From 2059d5d958770dcd6cfaf645584a54e93ce4a53b Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Fri, 25 Jan 2019 19:15:44 +0100 Subject: [PATCH 43/45] Message won't display --- exfi/io/fasta_to_dict.py | 1 - 1 file changed, 1 deletion(-) diff --git a/exfi/io/fasta_to_dict.py b/exfi/io/fasta_to_dict.py index 3306796..e688b9b 100644 --- a/exfi/io/fasta_to_dict.py +++ b/exfi/io/fasta_to_dict.py @@ -21,4 +21,3 @@ def fasta_to_dict(filename): identifier.split()[0]: sequence for identifier, sequence in SimpleFastaParser(handle) } - logging.info('Done') From 0521eceacaedb54ed9e86a7c00405c58cef2f60b Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Mon, 28 Jan 2019 10:52:59 +0100 Subject: [PATCH 44/45] Removed absurd case --- exfi/io/gff3_to_bed.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/exfi/io/gff3_to_bed.py b/exfi/io/gff3_to_bed.py index 9fd5bf0..849e354 100644 --- a/exfi/io/gff3_to_bed.py +++ b/exfi/io/gff3_to_bed.py @@ -59,9 +59,6 @@ def gff3_to_bed3(gff3_in, mode="ensembl"): sys.exit("Unknown mode") - if exons.shape[0] == 0: - return exons - exons = exons[['transcript_id', 'strand', 'start', 'end']] logging.info('Reordering exons by strand') From 2383648a0f107de92ec47dd0977661514f8024dc Mon Sep 17 00:00:00 2001 From: Jorge Langa Date: Mon, 28 Jan 2019 10:59:22 +0100 Subject: [PATCH 45/45] Version bump --- exfi/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exfi/__init__.py b/exfi/__init__.py index 9975e27..acea98e 100644 --- a/exfi/__init__.py +++ b/exfi/__init__.py @@ -3,4 +3,4 @@ filters. """ -__version__ = '1.5.0' +__version__ = '1.4.13'