forked from framazan/bwtandem
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdebug_search.py
More file actions
69 lines (54 loc) · 2 KB
/
debug_search.py
File metadata and controls
69 lines (54 loc) · 2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python3
"""Debug script to test motif search"""
import sys
sys.path.insert(0, '.')
from bwt import BWTCore, Tier1STRFinder, MotifUtils
# Test sequence
seq = "AAAAAAAAAAATATATATATATATATGGGGGGGGGGCACACACACACACACACATTTTTTTTTTAGCAGCAGCAGCAGCAGCCCCCCCCCCCAGTCAGTCAGTCAGTCAGTCNNNNNNNNNNATGCATGCATGCATGCATGC"
print(f"Sequence length: {len(seq)}")
print(f"Sequence: {seq[:50]}...")
print()
# Build BWT
seq_with_sentinel = seq + '$'
bwt = BWTCore(seq_with_sentinel, 32)
print(f"BWT built successfully")
print()
# Test motif search
test_motifs = ["AT", "CA", "AGC", "AGTC", "ATGC"]
for motif in test_motifs:
print(f"Testing motif: {motif}")
# Check entropy
entropy = MotifUtils.calculate_entropy(motif)
print(f" Entropy: {entropy:.3f} bits (min=1.0)")
# Check if primitive
is_prim = MotifUtils.is_primitive_motif(motif)
print(f" Primitive: {is_prim}")
# Search for motif
positions = bwt.get_kmer_positions(motif)
print(f" Positions found: {len(positions)}")
print(f" Positions: {positions}")
# Check rotations
rotations = [motif[i:] + motif[:i] for i in range(len(motif))]
rc_motif = MotifUtils.reverse_complement(motif)
rc_rotations = [rc_motif[i:] + rc_motif[:i] for i in range(len(rc_motif))]
all_rots = list(set(rotations + rc_rotations))
print(f" All rotations: {all_rots}")
all_pos = []
for rot in all_rots:
rot_pos = bwt.get_kmer_positions(rot)
print(f" {rot}: {len(rot_pos)} positions")
all_pos.extend(rot_pos)
unique_pos = sorted(set(all_pos))
print(f" Total unique positions: {len(unique_pos)}")
print()
# Now run Tier1 finder
print("="*60)
print("Running Tier1STRFinder...")
print("="*60)
tier1 = Tier1STRFinder(bwt, max_motif_length=9, allow_mismatches=True)
tier1.min_copies = 3
tier1.min_entropy = 1.0
repeats = tier1.find_strs("synthetic_chromosome")
print(f"\nFound {len(repeats)} repeats:")
for r in repeats:
print(f" {r.start}-{r.end}: {r.motif} x {r.copies:.1f} ({r.length}bp)")