123 changes: 70 additions & 53 deletions prot2mol/produce_molecules.py
@@ -10,114 +10,131 @@
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
warnings.filterwarnings("ignore")
from datasets import load_from_disk
from transformers import GenerationConfig
from transformers import AutoTokenizer, GPT2LMHeadModel
from transformers.utils import logging
import re
import numpy as np
import multiprocessing as mp
from protein_encoders import get_protein_encoder, get_protein_tokenizer
from utils_fps import generate_morgan_fingerprints_parallel

logging.set_verbosity_error()
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#os.environ["CUDA_VISIBLE_DEVICES"] = "6"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



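# Load the train/eval/test splits, extend the SELFIES tokenizer vocabulary with
# each split's alphabet, and select the test rows for the requested target.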
def load_dataset(path, prot_emb_model, prot_id):

    print("Loading required data")
    train_data = pd.read_csv(f"{path}/train.csv")
    alphabet = list(sf.get_alphabet_from_selfies(list(train_data.Compound_SELFIES)))
    tokenizer.add_tokens(alphabet)
    train_vec = np.load(f"{path}/train_vecs.npy")

    eval_data = pd.read_csv(f"{path}/eval.csv")
    alphabet = list(sf.get_alphabet_from_selfies(list(eval_data.Compound_SELFIES)))
    tokenizer.add_tokens(alphabet)
    del eval_data

    test_data = pd.read_csv(f"{path}/test_{prot_id}.csv")
    alphabet = list(sf.get_alphabet_from_selfies(list(test_data.Compound_SELFIES)))
    tokenizer.add_tokens(alphabet)

    target_data = load_from_disk(prot_emb_model)
    selected_target = test_data[test_data["Target_CHEMBL_ID"].isin([args.prot_id])].reset_index(drop=True)

    return train_data, train_vec, target_data, selected_target

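# Look up the target's FASTA in the dataset and mask nonstandard residues
# (U/Z/O/B -> X) before protein tokenization.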
def get_fasta(dataset, prot_id):
    test_df = pd.read_csv(dataset)
    fasta = test_df.loc[test_df.Target_CHEMBL_ID == prot_id, "Target_FASTA"].iloc[0]
    fasta = re.sub(r"[UZOB]", "X", fasta)

    return fasta

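# Collect the target's known ligands from the dataset and fingerprint them
# (Morgan, radius 2, 1024 bits); the fingerprints serve as the reference
# vectors for the metrics.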
+def prepare_protein_train_vecs(selfies_csv, prot_id):
+    df = pd.read_csv(selfies_csv)
+    df = df[df["Target_CHEMBL_ID"] == prot_id].reset_index(drop=True)

-def get_target(target, target_id):
+    selfies_list = df["Compound_SELFIES"].tolist()
+    smiles_list = [sf.decoder(s) for s in selfies_list]

-    enc_state = target[target["Target_CHEMBL_ID"].index(target_id)]["encoder_hidden_states"]
-    sample = {"encoder_hidden_states": enc_state, "target_chembl_id": target_id}
-    return sample
+    fps = generate_morgan_fingerprints_parallel(
+        smiles_list,
+        radius=2,
+        nBits=1024,
+        n_jobs= min(mp.cpu_count()-1, 10)
+    )
+    np.save("vec_for_prod.npy", fps)
+    print("vector file saved successfully")
+    return df, fps

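# Sample one batch of SELFIES strings conditioned on the protein's encoder
# hidden states.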
def generate_molecules(data):
-    generated_tokens = model.generate(encoder_hidden_states=data,
+    generated_tokens = model.generate(generation_config=generation_config,
+                                      encoder_hidden_states=data,
                                       num_return_sequences=args.bs,
                                       do_sample=True,
                                       max_length=200,
-                                      pad_token_id=1,
+                                      pad_token_id=tokenizer.pad_token_id,
                                       output_attentions = True if args.attn_output else False)

    generated_selfies = [tokenizer.decode(x, skip_special_tokens=True) for x in generated_tokens]
    return generated_selfies

print("Starting to generate molecules.")

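# Draw batches until num_samples molecules are collected. Note the loop runs
# int(num_samples / bs) times, so num_samples should be divisible by bs.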
-def generation_loop(target_data, num_samples, bs):
+def generation_loop(encoder_hidden, num_samples, bs):

    gen_mols = []
-    sample = get_target(target_data, args.prot_id)["encoder_hidden_states"].view(1,-1,1024).to("cuda:0")

    for _ in tqdm.tqdm(range(int(num_samples/bs))):
-        gen_mols.extend(generate_molecules(sample))
+        gen_mols.extend(generate_molecules(encoder_hidden))

    gen_mols_df = pd.DataFrame(gen_mols, columns=["Generated_SELFIES"])

    return gen_mols_df

print("Metrics are being calculated.")

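# Score the generated molecules against the target's known ligands via
# metrics_calculation, then write the molecules, per-sample results, and
# summary metrics to disk.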
-def calc_metrics(dataset, prot_emb_model, prot_id, num_samples, bs, generated_mol_file):
-
-    train_data, train_vec, target_data, selected_target = load_dataset(dataset, prot_emb_model=prot_emb_model, prot_id=prot_id)
-
-    gen_mols_df = generation_loop(target_data, num_samples, bs)
+def calc_metrics(train_df, train_vec, selected_target_df, gen_mols_df, generated_mol_file):
+    # train_df will be the whole dataset
+    #gen_mols_df = generation_loop(target_data, num_samples, bs)

-    metrics, generated_smiles = metrics_calculation(predictions=gen_mols_df["Generated_SELFIES"],
-                                                    references=selected_target["Compound_SELFIES"],
-                                                    train_data = train_data,
+    metrics, results_df = metrics_calculation(predictions=gen_mols_df["Generated_SELFIES"],
+                                              references=selected_target_df["Compound_SELFIES"],
+                                              train_data = train_df,
                                               train_vec = train_vec,
                                               training=False)
    print(metrics)

-    gen_mols_df["smiles"] = generated_smiles
+    gen_mols_df["smiles"] = results_df["smiles"]
    gen_mols_df.to_csv(generated_mol_file, index=False)

+    results_df.to_csv(generated_mol_file.replace(".csv", "_per_sample_results.csv"), index = False)

    with open(generated_mol_file.replace(".csv", "_metrics.json"), "w") as f:
        json.dump(metrics, f)
    print("Molecules and metrics are saved.")


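# Driver: load the chemical LM and protein encoder, embed the target FASTA,
# sample molecules, and score them.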
if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_file", default="./finetuned_models/set_100_finetuned_model/checkpoint-3100", help="Path of the pretrained model file.")
-    parser.add_argument("--prot_emb_model", default="./data/prot_embed/prot_t5/prot_comp_set_pchembl_None_protlen_None/embeddings", help="Path of the pretrained model file.")
+    parser.add_argument("--prot_emb_model", default="saprot", help="Encoder selection: prot_t5, esm2, saprot")
    parser.add_argument("--generated_mol_file", default="./saved_mols/_kt_finetune_mols.csv", help="Path of the output file for generated molecules.")
    parser.add_argument("--selfies_path", default='./data/papyrus/prot_comp_set_pchembl_None_protlen_500_human_False', help="Path of the input SELFIES dataset.")
    parser.add_argument("--attn_output", default=False, help="Whether to return attention weights during generation.")
    parser.add_argument("--prot_id", default="CHEMBL4282", help="Target Protein ID.")
-    parser.add_argument("--num_samples", default=10000, help="Sample number.")
-    parser.add_argument("--bs", default=100, help="Batch size.")
+    parser.add_argument("--num_samples", type = int, default=10000, help="Sample number.")
+    parser.add_argument("--bs", type = int, default=100, help="Batch size.")
    args = parser.parse_args()
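    # Example invocation (paths are illustrative, not part of this diff):
    #   python prot2mol/produce_molecules.py --model_file <checkpoint_dir> \
    #       --prot_emb_model saprot --selfies_path <dataset_dir> \
    #       --prot_id CHEMBL4282 --num_samples 10000 --bs 100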



-    genearted_mol_file_path = f"""./saved_mols/
-    {args.selfies_path.split("/")[2]}_
-    {args.prot_id}_
-    {args.model_file.split("/")[2]}_
-    {args.prot_emb_model.split("/")[3]}/
-    {args.num_samples}_mols.csv"""

    # Load tokenizer and the model
    print("Loading model and tokenizer")
    tokenizer = AutoTokenizer.from_pretrained("zjunlp/MolGen-large", padding_side="left") # we can convert this to our own tokenizer later.
    model_name = args.model_file
    model = GPT2LMHeadModel.from_pretrained(model_name).to("cuda:0")
+    generation_config = GenerationConfig.from_pretrained(model_name)
-    model.config.bos_token_id = 1
+    model.config.bos_token_id = tokenizer.bos_token_id
+    model.config.pad_token_id = tokenizer.pad_token_id
+    encoder = get_protein_encoder(args.prot_emb_model, active = False)
+    prot_tokenizer = get_protein_tokenizer(args.prot_emb_model)

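    # Embed the target FASTA once; generation_loop reuses this single
    # hidden-state tensor for every sampled batch.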
+    fasta = get_fasta(args.selfies_path, args.prot_id)

+    enc_inputs = prot_tokenizer(fasta, return_tensors="pt", truncation = True,
+                                padding = "max_length", max_length = encoder.max_length)
+    with torch.no_grad():
+        encoder_hidden = encoder.encode(sequences=enc_inputs["input_ids"],
+                                        attention_mask = enc_inputs["attention_mask"]).to(model.device)

+    prot_df, train_vec = prepare_protein_train_vecs(args.selfies_path, args.prot_id)

+    gen_mols_df = generation_loop(encoder_hidden, args.num_samples, args.bs)

-    calc_metrics(args.selfies_path, args.prot_emb_model, args.prot_id, args.num_samples, args.bs, genearted_mol_file_path)
+    calc_metrics(train_df = pd.read_csv(args.selfies_path), train_vec = train_vec, selected_target_df=prot_df,
+                 gen_mols_df=gen_mols_df, generated_mol_file=args.generated_mol_file)
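    # The run leaves three artifacts beside --generated_mol_file: the generated
    # molecules CSV, a *_per_sample_results.csv from metrics_calculation, and a
    # *_metrics.json summary.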