# InfluenzaSLM

Here are the steps I followed:

1. Started an interactive session:

   ```bash
   interact --mem=10g
   ```

2. Loaded Python 3.10.4:

   ```bash
   ml Python/3.10.4-GCCcore-11.3.0
   ```

3. Created a Python virtual environment:

   ```bash
   python -m venv ~/env/genslm
   ```

4. Activated the environment:

   ```bash
   source ~/env/genslm/bin/activate
   ```

5. Installed GenSLM into the environment:

   ```bash
   pip install git+https://github.com/ramanathanlab/genslm
   ```

6. Modified the `embeddings.py` script for our dataset (a quick import check you can run first is shown just below this list).
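Before editing the script, it's worth confirming that the install works inside the activated environment (a quick check, not part of the original steps):

```python
# Minimal import check: these are the names the embedding script relies on.
# If this prints without error, the GenSLM install is usable.
import torch
from genslm import GenSLM, SequenceDataset

print("genslm imports OK; CUDA available:", torch.cuda.is_available())
```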

`embeddings.py` is the file I used:

```python
import torch
import numpy as np
from torch.utils.data import DataLoader
from genslm import GenSLM, SequenceDataset
from Bio import SeqIO

# Initialize GenSLM with a model_id that matches the checkpoint's architecture
model_path = '/scratch/ss11645/GenSLM/MLProject/models/patric_25m_epoch01-val_loss_0.57_bias_removed.pt'
model = GenSLM('genslm_25M_patric')  # sets up the 25M-parameter architecture

# Load the fine-tuned weights (on CPU first, in case no GPU is present)
custom_model_state = torch.load(model_path, map_location=torch.device('cpu'))

# If the .pt file nests the weights under a key such as 'state_dict',
# unwrap it before loading
if isinstance(custom_model_state, dict) and 'state_dict' in custom_model_state:
    custom_model_state = custom_model_state['state_dict']
model.load_state_dict(custom_model_state)

model.eval()  # prepare the model for inference
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Load the sequences from the FASTA file
fasta_file = '/scratch/ss11645/GenSLM/MLProject/Seperated_files/h3n2.64000.fasta'
sequences = [str(record.seq) for record in SeqIO.parse(fasta_file, 'fasta')]

# Prepare dataset and dataloader
dataset = SequenceDataset(sequences, model.seq_length, model.tokenizer)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# Compute embeddings: take the last hidden layer and mean-pool over the
# sequence dimension (axis=1) so each sequence yields a single vector
embeddings = []
with torch.no_grad():
    for batch in dataloader:
        outputs = model(
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device),
            output_hidden_states=True,
        )
        emb = outputs.hidden_states[-1].detach().cpu().numpy()
        emb = np.mean(emb, axis=1)
        embeddings.append(emb)

# Concatenate all batches into one (num_sequences, hidden_size) array
embeddings = np.concatenate(embeddings, axis=0)

# Report the shape of the embeddings array
print(embeddings.shape)
```
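The script only prints the final shape; if you want the embeddings to survive the batch job, a one-line save can be appended to the end of the script. A minimal sketch that continues from the variables above (the output path is illustrative, not from the original script):

```python
# Persist the pooled embeddings for downstream analysis.
# 'embeddings' is the (num_sequences, hidden_size) array computed above;
# the output path is a hypothetical example.
np.save('/scratch/ss11645/GenSLM/MLProject/h3n2_embeddings.npy', embeddings)
```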
7. Submitted the job using `emb.sh` on the Sapelo2 cluster:
```bash
#!/bin/bash
#SBATCH --job-name=embeddings
#SBATCH --partition=bahl_p
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --mem=2gb
#SBATCH --cpus-per-task=4
#SBATCH --time=500:00:00
#SBATCH --output=%x_%j.out
#SBATCH --error=%x_%j.err
#SBATCH --mail-user=ss11645@uga.edu
#SBATCH --mail-type=END,FAIL    # mail events (NONE, BEGIN, END, FAIL, ALL)

cd $SLURM_SUBMIT_DIR

ml Python/3.10.4-GCCcore-11.3.0
source ~/env/genslm/bin/activate
python /scratch/ss11645/GenSLM/embeddings1.py
```
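The script is submitted with `sbatch emb.sh`; given the `%x_%j` patterns above, Slurm writes the logs to `embeddings_<jobid>.out` and `embeddings_<jobid>.err` in the submission directory.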