Skip to content

Commit

Permalink
fixed write_tf_records for very short sequences
Browse files — browse the repository at this point in the history
  • Loading branch information
LarsGab committed Jan 26, 2025
1 parent fa77ea0 commit d38a878
Showing 1 changed file with 2 additions and 7 deletions.
9 changes: 2 additions & 7 deletions bin/write_tfrecord_species.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,18 +47,13 @@ def load_clamsa_data(clamsa_prefix, seq_names, seq_len=None):
return np.concatenate([clamsa_chunks[::-1,::-1, [1,0,3,2]], clamsa_chunks], axis=0)

def get_species_data_hmm(genome_path='', annot_path='', species='', seq_len=500004, overlap_size=0, transition=False):

if not genome_path:
genome_path = f'/home/gabriell/deepl_data/genomes/{species}.fa.combined.masked'
if not annot_path:
annot_path=f'/home/gabriell//deepl_data/annot_longest_fixed/{species}.gtf'

fasta = GenomeSequences(fasta_file=genome_path,
chunksize=seq_len,
overlap=overlap_size)
fasta.encode_sequences()
seqs = [len(s) for s in fasta.sequences]
seq_names = fasta.sequence_names
seq_names = [seq_n for seq, seq_n in zip(fasta.sequence, fasta.sequence_names) \
if len(seq)>seq_len]
f_chunk, _, _ = fasta.get_flat_chunks(strand='+', pad=False)
del fasta
print(f_chunk.shape)
Expand Down

0 comments on commit d38a878

Please sign in to comment.