Skip to content

Commit

Permalink
Merge branch 'microsoft:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
colbyford authored Jul 19, 2024
2 parents e6f09f0 + f813abd commit aa4a7de
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 6 deletions.
4 changes: 2 additions & 2 deletions evodiff/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ def __getitem__(self, idx):
slice_start = 0
seq_len = msa_seq_len

sliced_msa = msa[:, slice_start: slice_start + self.max_seq_len]
sliced_msa = msa[:, slice_start: slice_start + seq_len]
anchor_seq = sliced_msa[0] # This is the query sequence in MSA

sliced_msa = [list(seq) for seq in sliced_msa if (list(set(seq)) != [self.tokenizer.alphabet.index(GAP)])]
Expand All @@ -271,7 +271,7 @@ def __getitem__(self, idx):
anchor_seq = np.expand_dims(anchor_seq, axis=0)
output = np.concatenate((anchor_seq, sliced_msa[random_idx]), axis=0)
elif self.selection_type == 'non-random':
output = sliced_msa[:64]
output = sliced_msa[:self.n_sequences]
elif self.selection_type == "MaxHamming":
output = [list(anchor_seq)]
msa_subset = sliced_msa[1:]
Expand Down
26 changes: 22 additions & 4 deletions evodiff/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import torch
import evodiff
import numpy as np
from sequence_models.constants import MASK, MSA_PAD, MSA_ALPHABET, MSA_AAS, GAP, START, STOP
from sequence_models.constants import MASK, MSA_PAD, MSA_ALPHABET, MSA_AAS, GAP, START, STOP, SEP
from evodiff.constants import BLOSUM_ALPHABET
from sklearn.preprocessing import normalize
import itertools
Expand Down Expand Up @@ -149,14 +149,15 @@ def parse_fasta(seq_file, idx):
class Tokenizer(object):
"""Convert between strings and index"""
def __init__(self, protein_alphabet=MSA_ALPHABET, pad=MSA_PAD, mask=MASK, all_aas=MSA_AAS, gap=GAP, start=START,
stop=STOP, path_to_blosum=None, sequences=False):
stop=STOP, sep=SEP, path_to_blosum=None, sequences=False):
self.alphabet = list("".join(protein_alphabet))
self.all_aas = list("".join(all_aas))
self.pad = pad
self.mask = mask
self.gap = gap
self.start = start
self.stop = stop
self.sep = sep
self.a_to_i = {u: i for i, u in enumerate(self.alphabet)}
self.i_to_a = np.array(self.alphabet)
if path_to_blosum is not None:
Expand Down Expand Up @@ -188,6 +189,10 @@ def start_id(self):
def stop_id(self):
return self.tokenize(self.stop)[0]

@property
def sep_id(self):
    """Index of the separator token: the first element of ``tokenize(self.sep)``."""
    sep_tokens = self.tokenize(self.sep)
    return sep_tokens[0]

def q_blosum(self):
q = np.array([i for i in self.matrix_dict.values()])
q = q.reshape((len(self.all_aas),len(self.all_aas)))
Expand Down Expand Up @@ -445,7 +450,7 @@ def run_tmscore(fpath, pdb, num_seqs, path_to_tmscore='TMscore', amlt=False, rer
else:
#if reres:
print("NOT USING RERES") # Manually switching between reference and re-res for multi-chain PDB TMscores - add more efficient approach
ref_path = os.path.join(out_fpath, pdb + '_reference.pdb')
ref_path = os.path.join(out_fpath, pdb + '_reres.pdb')
#else:
# ref_path = os.path.join(out_fpath, pdb + '_reference.pdb')
print(ref_path)
Expand Down Expand Up @@ -521,7 +526,20 @@ def eval_disopred_output(out_fpath, ref_df, prefix='', num_seqs=100):
return mean_gen_score #, mean_og_score



import Bio
from Bio.PDB import PDBParser
import numpy as np
def get_bfactor(filename):
    """Collect the B-factor of every atom in a PDB file.

    Parses ``filename`` with a permissive ``PDBParser`` and visits every
    model, chain, residue and atom of the resulting structure, in that
    nesting order.

    Args:
        filename: Path to the PDB file to read,
            e.g. ``'generated/100/pdb/SEQUENCE_0.pdb'``.

    Returns:
        A tuple ``(values, mean)`` where ``values`` is a NumPy array holding
        each atom's ``get_bfactor()`` result in traversal order and ``mean``
        is the mean of that array.
    """
    pdb_parser = PDBParser(PERMISSIVE=1)
    # 'A' is just the identifier assigned to the parsed structure.
    structure = pdb_parser.get_structure('A', filename)
    per_atom = np.array([
        atom.get_bfactor()
        for model in structure
        for chain in model
        for residue in chain
        for atom in residue
    ])
    return per_atom, per_atom.mean()



Expand Down

0 comments on commit aa4a7de

Please sign in to comment.