Skip to content

Commit 5a18033

Browse files
committed
Finish codon-based trimming
1 parent b2bb0da commit 5a18033

File tree

6 files changed

+153
-16
lines changed

6 files changed

+153
-16
lines changed

clipkit/api.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def clipkit(
3939

4040
# override some options not currently available through programmatic interface
4141
complement = False
42+
codon = False
4243
use_log = False
4344
quiet = True
4445

@@ -51,6 +52,7 @@ def clipkit(
5152
gaps,
5253
gap_characters,
5354
complement,
55+
codon,
5456
TrimmingMode(mode),
5557
use_log,
5658
quiet,

clipkit/clipkit.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ class TrimRun:
4747
input_file_format: FileFormat
4848
output_file_format: FileFormat
4949
gaps: float
50+
codon: bool
5051
version: str = current_version
5152

5253
@property
@@ -67,6 +68,7 @@ def run(
6768
gaps: float,
6869
gap_characters: Union[list, None],
6970
complement: bool,
71+
codon: bool,
7072
mode: TrimmingMode,
7173
use_log: bool,
7274
quiet: bool,
@@ -99,7 +101,7 @@ def run(
99101
gaps = smart_gap_threshold_determination(alignment, gap_characters)
100102

101103
msa = create_msa(alignment, gap_characters)
102-
msa.trim(mode, gap_threshold=gaps)
104+
msa.trim(mode, gap_threshold=gaps, site_positions_to_trim=None, codon=codon)
103105

104106
trim_run = TrimRun(
105107
alignment,
@@ -109,6 +111,7 @@ def run(
109111
input_file_format,
110112
output_file_format,
111113
gaps,
114+
codon,
112115
)
113116

114117
return trim_run, msa.stats
@@ -123,6 +126,7 @@ def execute(
123126
gaps: float,
124127
gap_characters: Union[list, None],
125128
complement: bool,
129+
codon: bool,
126130
mode: TrimmingMode,
127131
use_log: bool,
128132
quiet: bool,
@@ -151,6 +155,7 @@ def execute(
151155
gaps,
152156
gap_characters,
153157
complement,
158+
codon,
154159
mode,
155160
use_log,
156161
quiet,
@@ -168,6 +173,7 @@ def execute(
168173
trim_run.gap_characters,
169174
mode,
170175
complement,
176+
codon,
171177
use_log,
172178
)
173179

clipkit/msa.py

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from Bio.Seq import Seq
33
from Bio.SeqRecord import SeqRecord
44
import numpy as np
5+
from itertools import chain
56
from typing import Union
67

78
from .modes import TrimmingMode
@@ -113,7 +114,12 @@ def trim(
113114
site_positions_to_trim = np.array(site_positions_to_trim)
114115
if not isinstance(site_positions_to_trim, np.ndarray):
115116
raise ValueError("site_positions_to_trim must be a list or np array")
116-
self._site_positions_to_trim = site_positions_to_trim
117+
118+
self._site_positions_to_trim = (
119+
self.determine_all_codon_sites_to_trim(site_positions_to_trim)
120+
if codon is True
121+
else site_positions_to_trim
122+
)
117123
else:
118124
self._site_positions_to_trim = self.determine_site_positions_to_trim(
119125
mode,
@@ -210,10 +216,8 @@ def determine_site_positions_to_trim(self, mode, gap_threshold, codon=False):
210216
Example:
211217
[2, 9] -> [1, 2, 3, 7, 8, 9]
212218
"""
213-
sites_to_trim = map(
214-
self.determine_codon_triplet_positions, sites_to_trim
215-
).flatten()
216-
print(sites_to_trim)
219+
return self.determine_all_codon_sites_to_trim(sites_to_trim)
220+
217221
return sites_to_trim
218222

219223
def generate_debug_log_info(self):
@@ -234,10 +238,31 @@ def generate_debug_log_info(self):
234238
gappyness,
235239
)
236240

237-
def determine_codon_triplet_positions(alignment_position):
241+
def determine_all_codon_sites_to_trim(self, sites_to_trim):
242+
"""
243+
For each position in sites_to_trim we need the full triplet of codon positions.
244+
245+
Sites to trim -> all codon sites to trim
246+
[2, 8] -> [0, 1, 2, 6, 7, 8]
247+
"""
248+
sites_to_trim_codon = [
249+
self.determine_codon_triplet_positions(site_pos)
250+
for site_pos in sites_to_trim
251+
]
252+
flattened_unique_sites = list(set(chain(*sites_to_trim_codon)))
253+
return np.array(flattened_unique_sites)
254+
255+
def determine_codon_triplet_positions(self, alignment_position):
256+
"""
257+
Block 0 -> [0,1,2], block 1 -> [3,4,5]
258+
259+
We filter to make sure we are not including any positions out of range
260+
"""
238261
block = alignment_position // 3
239-
remainder = alignment_position % 3
240-
if remainder:
241-
block += 1
242-
codon_triplet_index = block * 3
243-
return [codon_triplet_index - 2, codon_triplet_index - 1, codon_triplet_index]
262+
codon_triplet_index_start = block * 3
263+
sites = [
264+
codon_triplet_index_start,
265+
codon_triplet_index_start + 1,
266+
codon_triplet_index_start + 2,
267+
]
268+
return list(filter(lambda x: x <= self._original_length - 1, sites))

clipkit/write.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def write_user_args(
3434
gap_chars: list,
3535
mode: "TrimmingMode",
3636
complement: bool,
37+
codon: bool,
3738
use_log: bool,
3839
) -> None:
3940
if seq_type.value == "nt":
@@ -57,6 +58,7 @@ def write_user_args(
5758
Gap characters: {gap_chars}
5859
Trimming mode: {mode.value}
5960
Create complementary output: {complement}
61+
Process as codons: {codon}
6062
Create log file: {use_log}
6163
""" # noqa
6264
)
@@ -77,9 +79,9 @@ def write_output_files_message(
7779
------------------------
7880
| Writing output files |
7981
------------------------
80-
trimmed alignment: {out_file_name}
81-
complement file: {out_file_name + '.complement' if complement else False}
82-
log file: {out_file_name + '.log' if use_log else False}
82+
Trimmed alignment: {out_file_name}
83+
Complement file: {out_file_name + '.complement' if complement else False}
84+
Log file: {out_file_name + '.log' if use_log else False}
8385
"""
8486
)
8587
)

tests/integration/test_codon.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def test_simple_codon(self):
2626
sequence_type=None,
2727
complement=False,
2828
codon=True,
29-
gaps=0.2,
29+
gaps=0.8,
3030
mode=TrimmingMode.gappy,
3131
use_log=False,
3232
gap_characters=DEFAULT_NT_GAP_CHARS,
@@ -42,3 +42,33 @@ def test_simple_codon(self):
4242
output_content = out_file.read()
4343

4444
assert expected_content == output_content
45+
46+
def test_simple_codon_all_trimmed(self):
47+
"""
48+
test codon
49+
usage: clipkit simple.fa -co
50+
"""
51+
output_file = "output/simple.fa_gappy_codon"
52+
kwargs = dict(
53+
input_file=f"{here.parent}/samples/simple.fa",
54+
output_file=output_file,
55+
input_file_format="fasta",
56+
output_file_format="fasta",
57+
sequence_type=None,
58+
complement=False,
59+
codon=True,
60+
gaps=0.1,
61+
mode=TrimmingMode.gappy,
62+
use_log=False,
63+
gap_characters=DEFAULT_NT_GAP_CHARS,
64+
quiet=True,
65+
)
66+
67+
execute(**kwargs)
68+
69+
expected_content = ">1\n>2\n>3\n>4\n>5\n"
70+
71+
with open(output_file, "r") as out_file:
72+
output_content = out_file.read()
73+
74+
assert expected_content == output_content

tests/unit/test_msa.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,3 +62,75 @@ def test_trim_by_provided_site_positions_list(self):
6262
]
6363
)
6464
np.testing.assert_equal(msa.sites_kept, expected_sites_kept)
65+
66+
@pytest.mark.parametrize(
67+
"sites_to_trim, expected",
68+
[
69+
(
70+
[0],
71+
np.array(
72+
[
73+
["T", "A", "T"],
74+
["-", "A", "T"],
75+
["-", "T", "A"],
76+
["-", "T", "A"],
77+
["-", "T", "-"],
78+
]
79+
),
80+
),
81+
(
82+
[2],
83+
np.array(
84+
[
85+
["T", "A", "T"],
86+
["-", "A", "T"],
87+
["-", "T", "A"],
88+
["-", "T", "A"],
89+
["-", "T", "-"],
90+
]
91+
),
92+
),
93+
(
94+
[3],
95+
np.array(
96+
[
97+
["A", "-", "G"],
98+
["A", "-", "G"],
99+
["A", "-", "G"],
100+
["A", "G", "A"],
101+
["A", "C", "a"],
102+
]
103+
),
104+
),
105+
(
106+
[5],
107+
np.array(
108+
[
109+
["A", "-", "G"],
110+
["A", "-", "G"],
111+
["A", "-", "G"],
112+
["A", "G", "A"],
113+
["A", "C", "a"],
114+
]
115+
),
116+
),
117+
(
118+
[0, 1, 2, 3, 4, 5],
119+
np.array(
120+
[
121+
[],
122+
[],
123+
[],
124+
[],
125+
[],
126+
],
127+
dtype=object,
128+
),
129+
),
130+
],
131+
)
132+
def test_trim_codons(self, sites_to_trim, expected):
133+
bio_msa = get_biopython_msa("tests/unit/examples/simple.fa")
134+
msa = MSA.from_bio_msa(bio_msa)
135+
msa.trim(site_positions_to_trim=sites_to_trim, codon=True)
136+
np.testing.assert_equal(msa.trimmed, expected)

0 commit comments

Comments
 (0)