Skip to content

Commit

Permalink
240808
Browse files Browse the repository at this point in the history
  • Loading branch information
ssocean committed Aug 8, 2024
1 parent b7d438d commit fdb1b67
Show file tree
Hide file tree
Showing 7 changed files with 4,345 additions and 1 deletion.
15 changes: 14 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,22 @@ OMP_NUM_THREADS=1 accelerate launch offcial_train.py \
Then, type `sh train.sh` in the console and wait for the training to finish.

## Testing (batch)
Similar to Fine-tuning, prepare `test.sh` as below:
```
python inference.py \
--data_path ScImpactPredict/NAID/NAID_test_extrainfo.csv \
--weight_dir path_to_runs_dir
```
Then, type `sh test.sh`.

## Testing (single article)
Just modify the `single_pred.py` file, then type `python single_pred.py`.
Just modify the `single_pred.py` file, then type `python single_pred.py`.

## Model Weights
First, download the pretrained LLaMA-3 weights from the official Hugging Face site.
Then, download the provided LoRA weights (runs_dir) [here](https://drive.google.com/file/d/13-ugXsm35AuzOBUlL6jPacY_z8qVIb7x/view?usp=sharing).

## Comparison with Previous Methods
The scripts for the previous methods are provided in `previous_methods/`. With a few adjustments for your specific setup (e.g., the dataset paths), they should work fine. Since these models train very quickly (less than a few minutes on a single RTX 3080), we won’t be providing the trained models.

### We are confident in our methodology and experiments, and you should be able to reproduce the results reported in our paper.
3,628 changes: 3,628 additions & 0 deletions TKPD/TKPD.csv

Large diffs are not rendered by default.

84 changes: 84 additions & 0 deletions TKPD/prompt_keyword_async_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import os
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed
from retry import retry
import Levenshtein
from tqdm import tqdm

# Placeholder credentials/endpoint for the OpenAI-compatible API; replace before running.
# NOTE(review): avoid committing a real key here — consider reading it from the environment.
API_SECRET_KEY = "sk-xxx"
BASE_URL = "xxx"
# Exported before the langchain imports that follow — presumably so the client
# picks them up when it is constructed; confirm against langchain_openai.
os.environ["OPENAI_API_KEY"] = API_SECRET_KEY
os.environ["OPENAI_API_BASE"] = BASE_URL
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
from langchain_openai import ChatOpenAI

def normalized_edit_distance(str1, str2):
    """Return the Levenshtein distance of two strings scaled by the longer length.

    Both strings are stripped of surrounding whitespace and lower-cased before
    they are compared, so the measure ignores case and padding.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        ``distance / max(len(str1), len(str2))`` computed on the normalised
        strings, or ``0`` when both normalised strings are empty.
    """
    left = str1.strip().lower()
    right = str2.strip().lower()

    raw_distance = Levenshtein.distance(left, right)
    longest = max(len(left), len(right))

    # Two empty strings are identical; also avoids dividing by zero.
    if longest == 0:
        return 0
    return raw_distance / longest
# Alternative user prompts for keyphrase/keyword extraction.
# NOTE(review): this list is not referenced anywhere in the visible code —
# presumably kept for prompt-comparison experiments; confirm before removing.
usr_prompts = ["Given the following title and abstract of the research paper, identify the core task or problem being addressed in few words. You MUST respond with the keyphrase ONLY in this format: xxx",
"Based on the given title and abstract, what is the main focus or task of the research? Summarize it in a few words. You MUST respond with the keyphrase ONLY in this format: xxx",
"Analyze the title and abstract provided to identify the central task or topic of the paper, which will be used as a keyword for searching related academic papers on Google Scholar. Avoid terms that are either too broad (such as 'deep learning' or 'computer vision') or too specific (such as certain model names, unless widely recognized.). You MUST respond with the keyword ONLY in this format: xxx"
]
@retry(delay=2)
def get_chatgpt_field(title, abstract=None, sys_content=None, usr_prompt=None, extra_prompt=True, model="gpt-3.5-turbo-0125", temperature=0):
    """Ask a chat model for the topic keyword of a paper.

    The conversation sent to the model is: a system message, optionally one
    worked example (human turn plus the answer 'remote sensing object
    detection'), and finally the actual query built from ``title`` and, when
    given, ``abstract``.

    Args:
        title: paper title inserted into the query.
        abstract: optional abstract; when falsy it is omitted from the query
            and the worked example's title/abstract text is empty as well.
        sys_content: system prompt; a built-in default is used when falsy.
        usr_prompt: instruction prepended to the example and the query; a
            built-in default is used when falsy.
        extra_prompt: when truthy, include the worked example turns.
        model: model name passed to ``ChatOpenAI``.
        temperature: sampling temperature passed to ``ChatOpenAI``.

    Returns:
        The ``content`` of the model's reply to the single conversation.

    NOTE(review): ``@retry(delay=2)`` is given no ``tries`` limit, so failures
    are presumably retried indefinitely (including non-transient ones such as
    a bad API key) — confirm against the ``retry`` package.
    """
    sys_content = sys_content or (
        "You are a profound researcher who is good at identifying the topic key phrase from paper's title and "
        "abstract. Ensure that the topic key phrase precisely defines the research area within the article. For effective academic searching, such as on Google Scholar, the field should be specifically targeted rather than broadly categorized. For instance, use 'image classification' instead of the general 'computer vision' to enhance relevance and searchability of related literature.")
    usr_prompt = usr_prompt or ("Analyze the title and abstract provided to identify the central topic of the paper, which will be used as a keyword for searching related academic papers on Google Scholar. Avoid terms that are either too broad (such as 'deep learning' or 'computer vision') or too specific (such as obscure model names, unless widely recognized). Focus on a keyword that reflects the innovative aspect or core methodology of the study. You MUST respond with the keyword ONLY in this format: xxx")

    # Title/abstract of the worked example; only populated when the real query
    # also carries an abstract.
    if abstract:
        extra_abs_content = '''
Given Title: Large Selective Kernel Network for Remote Sensing Object Detection
Given Abstract: Recent research on remote sensing object detection has largely focused on improving the representation of oriented bounding boxes but has overlooked the unique prior knowledge presented in remote sensing scenarios. Such prior knowledge can be useful because tiny remote sensing objects may be mistakenly detected without referencing a sufficiently long-range context, which can vary for different objects. This paper considers these priors and proposes the lightweight Large Selective Kernel Network (LSKNet). LSKNet can dynamically adjust its large spatial receptive field to better model the ranging context of various objects in remote sensing scenarios. To our knowledge, large and selective kernel mechanisms have not been previously explored in remote sensing object detection. Without bells and whistles, our lightweight LSKNet sets new state-of-the-art scores on standard benchmarks, i.e., HRSC2016 (98.46% mAP), DOTA-v1.0 (81.85% mAP), and FAIR1M-v1.0 (47.87% mAP).'''
    else:
        extra_abs_content = ''

    messages = [SystemMessage(content=sys_content)]
    if extra_prompt:
        # One-shot demonstration: example question followed by its expected answer.
        messages.extend([HumanMessage(content=f'''{usr_prompt}\n\n{extra_abs_content}'''), AIMessage(content='remote sensing object detection')])

    # The actual query for the paper being processed.
    content = f'''{usr_prompt}\n
Given Title: {title}
'''
    if abstract:
        content += f'Given Abstract: {abstract}'
    messages.append(HumanMessage(content=content))

    chat = ChatOpenAI(model=model, temperature=temperature)
    # A batch of one conversation; take its single reply.
    return chat.batch([messages])[0].content
import csv
from multiprocessing import Pool
# User prompt under evaluation: process_row passes it to get_chatgpt_field and
# main prints it next to the resulting average distance.
prompt = "Identify the research field from the given title and abstract. You MUST respond with the keyword ONLY in this format: xxx"
def process_row(row):
    """Predict a keyword for one CSV row and score it against the ground truth.

    Args:
        row: sequence whose first three items are the title, the abstract and
            the ground-truth keyword.

    Returns:
        The normalized edit distance between the ground-truth keyword and the
        keyword predicted by ``get_chatgpt_field`` using the module-level
        ``prompt``.
    """
    title, abstract, gt_keyword = row[0], row[1], row[2]
    predicted_keyword = get_chatgpt_field(title, abstract, usr_prompt=prompt)

    distance = normalized_edit_distance(gt_keyword, predicted_keyword)
    print(f'GT:{gt_keyword} \t Pred:{predicted_keyword} \t Ned:{distance}')
    return distance

def main():
    """Score ``prompt`` on every row of ``TKPD.csv`` and print the mean distance.

    Reads all rows of the GBK-encoded CSV, prints how many were read, evaluates
    them with ``process_row`` in a pool of 12 worker processes, and prints the
    prompt together with the average normalized edit distance (0 when there
    are no rows).
    """
    with open(r'TKPD.csv', 'r', newline='', encoding='gbk') as input_csvfile:
        rows = list(csv.reader(input_csvfile))
    print(len(rows))

    with Pool(12) as workers:
        results = workers.map(process_row, rows)

    if results:
        average_distance = sum(results) / len(results)
    else:
        average_distance = 0
    print(f"{prompt}: {average_distance}")

# Entry point: run the prompt evaluation only when executed as a script.
if __name__ == '__main__':
    main()
250 changes: 250 additions & 0 deletions previous_methods/Doc2Vec&Bi-LSTM.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from torch.utils.data import DataLoader
import torch.nn as nn
from sklearn.model_selection import train_test_split
import nltk
from tqdm import tqdm

# Fetch NLTK resources at import time.
# NOTE(review): preprocess_text also uses the stopwords corpus and the WordNet
# lemmatizer, whose downloads are commented out further down — presumably they
# must already be installed locally; confirm.
nltk.download('punkt')
nltk.download('omw-1.4')

# def dcg_at_k(scores, k):
# """
# scores: a list of relevance scores in predicted order
# k: number of results to consider
# """
# scores = np.asfarray(scores)[:k]
# return np.sum(scores / np.log2(np.arange(2, scores.size + 2)))
#
# def ndcg_at_k(predicted_scores, true_scores, k):
# """
# predicted_scores: model's predicted scores
# true_scores: ground truth scores
# k: number of results to consider
# """
# idcg = dcg_at_k(sorted(true_scores, reverse=True), k)
# dcg = dcg_at_k(predicted_scores, k)
# return dcg / idcg if idcg > 0 else 0

import torch
import numpy as np

from sklearn.metrics import ndcg_score

def NDCG_k(predictions, labels, k=20):
    """Compute NDCG@k for one ranked list of predictions against its labels.

    Args:
        predictions: torch tensor of predicted scores; flattened to 1-D.
        labels: torch tensor of ground-truth scores with the same number of
            elements as ``predictions``; flattened to 1-D.
        k: rank cut-off passed to ``sklearn.metrics.ndcg_score``.

    Returns:
        The NDCG@k value, or ``-1`` when fewer than ``k`` predictions are
        available.
    """
    # Print the shapes once (the original nested print also emitted "None").
    print(predictions.shape, labels.shape)

    # reshape(-1) instead of squeeze(): squeeze() collapses a single-element
    # tensor to 0-d, on which len() below raises TypeError.
    predictions = predictions.detach().cpu().numpy().reshape(-1)
    labels = labels.detach().cpu().numpy().reshape(-1)
    if len(predictions) < k:
        return -1

    # ndcg_score expects 2-D input (n_queries, n_items); this is a single query.
    ndcg = ndcg_score([labels], [predictions], k=k)

    print("Average NDCG:", ndcg)
    return ndcg

# nltk.download('stopwords')
# nltk.download('wordnet')


def preprocess_text(text):
    """Turn raw text into a list of lemmatized, stopword-free tokens.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the result is tokenized with ``nltk.word_tokenize``, English
    stopwords are dropped and the remaining tokens are lemmatized.

    Args:
        text: the raw string to process.

    Returns:
        List of processed tokens, in their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    lowered = text.lower()
    # Delete all punctuation characters in a single pass.
    without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))

    tokens = []
    for token in nltk.word_tokenize(without_punctuation):
        if token in english_stopwords:
            continue
        tokens.append(lemmatizer.lemmatize(token))
    return tokens



class PapersDataset(Dataset):
    """Dataset yielding a (Doc2Vec vector, target value) pair per paper row."""

    def __init__(self, dataframe, doc2vec_model,target_type='TNCSI'):
        """
        dataframe: DataFrame holding the training or test data
        doc2vec_model: an already trained Doc2Vec model
        target_type: name of the target; values starting with 'TNCSI' are read
            from the column of that name, anything else reads the 'cites' column
        """
        self.dataframe = dataframe
        self.doc2vec_model = doc2vec_model
        self.target_type = target_type

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return (feature vector, label) as float32 tensors for row ``idx``."""
        record = self.dataframe.iloc[idx]

        # Title and abstract are joined with a space and embedded together.
        tokens = preprocess_text(f"{record['title']} {record['abstract']}")
        features = self.doc2vec_model.infer_vector(tokens)

        target_column = self.target_type if self.target_type.startswith('TNCSI') else 'cites'
        return (
            torch.tensor(features, dtype=torch.float32),
            torch.tensor(record[target_column], dtype=torch.float32),
        )


def train_doc2vec(documents):
    """Train a Doc2Vec model on the given documents.

    Each document is run through ``preprocess_text`` and tagged with its
    position in ``documents``.

    Args:
        documents: iterable of raw text strings.

    Returns:
        The Doc2Vec model built with 100-dimensional vectors, window 2,
        min_count 1, 4 workers and 40 epochs.
    """
    corpus = []
    for index, document in enumerate(documents):
        corpus.append(TaggedDocument(words=preprocess_text(document), tags=[index]))
    return Doc2Vec(corpus, vector_size=100, window=2, min_count=1, workers=4, epochs=40)



class Attention(nn.Module):
    """Attention pooling: a softmax-weighted sum over dimension 1 of the input."""

    def __init__(self, hidden_size):
        """Create the scoring layer mapping ``hidden_size`` features to one score."""
        super().__init__()
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, lstm_output):
        """Pool ``lstm_output`` along dim 1.

        A score is computed per position (linear layer followed by tanh),
        normalised with softmax over dim 1, and used to weight the input
        before summing over dim 1.
        Assumes ``lstm_output`` is (batch, seq, hidden_size) — TODO confirm.
        """
        scores = torch.tanh(self.linear(lstm_output))
        weights = torch.softmax(scores, dim=1)

        # The trailing size-1 axis of the weights broadcasts over the features.
        return (lstm_output * weights).sum(dim=1)

class CitationModel(nn.Module):
    """Bidirectional LSTM with attention pooling and a single-output linear head.

    The head's output goes through a sigmoid when ``target_type`` starts with
    'TNCSI', and through a ReLU otherwise.
    """

    def __init__(self, embedding_dim, hidden_dim,target_type='TNCSI'):
        """Build the layers.

        embedding_dim: size of each input feature vector
        hidden_dim: LSTM hidden size per direction
        target_type: selects the output activation (see class docstring)
        """
        super().__init__()
        # Both directions are concatenated, hence twice the hidden size downstream.
        bidirectional_width = hidden_dim * 2
        self.bi_lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = Attention(bidirectional_width)
        self.fc = nn.Linear(bidirectional_width, 1)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.target_type = target_type

    def forward(self, x):
        """Return one activated score per batch element."""
        sequence_features, _ = self.bi_lstm(x)
        pooled = self.attention(sequence_features)
        raw_score = self.fc(pooled)

        activation = self.sigmoid if self.target_type.startswith('TNCSI') else self.relu
        return activation(raw_score)



def train_model(model, train_loader, criterion, optimizer, epochs, device):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Args:
        model: the network to optimise; put into training mode here.
        train_loader: iterable of ``(inputs, targets)`` batches.
        criterion: loss called as ``criterion(outputs, targets)``.
        optimizer: optimizer stepping the model's parameters.
        epochs: number of passes over the loader.
        device: device the batches are moved to.

    Prints the mean batch loss after every epoch and the full loss history
    at the end; returns nothing.
    """
    model.train()
    loss_history = []

    for epoch in tqdm(range(epochs)):
        running_loss = 0
        for features, labels in train_loader:
            # Both tensors get an extra axis at dim 1: targets to line up with
            # the (batch, 1) model output, inputs presumably as a length-1
            # sequence for the LSTM.
            features = features.to(device).unsqueeze(1)
            labels = labels.to(device).unsqueeze(1)

            optimizer.zero_grad()
            batch_loss = criterion(model(features), labels)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        epoch_loss = running_loss / len(train_loader)
        loss_history.append(epoch_loss)
        print(f'Epoch {epoch+1}, Loss: {epoch_loss}')

    print("Training complete. Loss history:")
    print(loss_history)

# Evaluation function for NDCG
def evaluate_model(model, test_loader, device,k=20):
    """Evaluate ``model`` on ``test_loader``: print the MSE and return NDCG@k.

    Args:
        model: the trained network; put into eval mode here.
        test_loader: iterable of ``(inputs, target)`` batches.
        device: device the inputs are moved to.
        k: rank cut-off forwarded to ``NDCG_k``.

    Returns:
        Whatever ``NDCG_k`` returns for the concatenated predictions and
        targets of the whole loader.
    """
    model.eval()
    pred_scores = []
    target_scores = []
    squared_error_sum = 0.0
    sample_count = 0
    with torch.no_grad():
        for inputs, target in test_loader:
            inputs = inputs.to(device).unsqueeze(1)  # Ensure input is correctly shaped
            outputs = model(inputs)

            # Flatten with reshape(-1) rather than squeeze(): a final batch of
            # size 1 would otherwise become a 0-d tensor, which torch.cat
            # below cannot concatenate.
            predicted_scores = outputs.detach().cpu().reshape(-1)
            true_scores = target.detach().cpu().reshape(-1)

            # Accumulate the squared error per sample so the reported MSE is
            # not skewed by a smaller last batch (mean of batch means).
            squared_error_sum += torch.sum((predicted_scores - true_scores) ** 2).item()
            sample_count += true_scores.numel()

            print(predicted_scores)
            print(true_scores)
            print('-'*50)
            pred_scores.append(predicted_scores)
            target_scores.append(true_scores)

    avg_loss = squared_error_sum / sample_count
    print(f'AVG MSE:{avg_loss}')

    all_pred = torch.cat(pred_scores, dim=0)
    all_GT = torch.cat(target_scores, dim=0)

    ndcg = NDCG_k(all_pred, all_GT,k=k)
    print(ndcg)

    return ndcg
# Main function
def main():
    """Train the Doc2Vec + Bi-LSTM baseline on NAID and evaluate it with NDCG.

    Loads the train/test CSV files, fits Doc2Vec on the training abstracts,
    trains ``CitationModel`` for 30 epochs with MSE loss, saves its weights
    and finally runs ``evaluate_model`` on the test split.
    """
    target_type = 'TNCSI_SP'
    ep = 30

    train_data = pd.read_csv(r' Desktop\NAID_train_extrainfo.csv')
    test_data = pd.read_csv(r' Desktop\NAID_test_extrainfo.csv')

    # Doc2Vec is fitted on the training abstracts only.
    doc2vec_model = train_doc2vec(train_data['abstract'].tolist())

    # Datasets wrapped in loaders; only the training split is shuffled.
    train_loader = DataLoader(
        PapersDataset(dataframe=train_data, doc2vec_model=doc2vec_model, target_type=target_type),
        batch_size=1024,
        shuffle=True,
    )
    test_loader = DataLoader(
        PapersDataset(dataframe=test_data, doc2vec_model=doc2vec_model, target_type=target_type),
        batch_size=1024,
        shuffle=False,
    )

    # Model setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    model = CitationModel(embedding_dim=100, hidden_dim=1024, target_type=target_type).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
    criterion = nn.MSELoss()

    train_model(model, train_loader, criterion, optimizer, epochs=ep, device=device)
    torch.save(model.state_dict(), f'LSTM-{target_type}-{ep}.pth')

    # To evaluate a saved checkpoint instead of retraining, load it with
    # model.load_state_dict(torch.load(f'LSTM-{target_type}-{ep}.pth')).

    # Evaluate using NDCG
    evaluate_model(model, test_loader, device=device)

# Entry point: train and evaluate the baseline only when executed as a script.
if __name__ == '__main__':
    main()
Loading

0 comments on commit fdb1b67

Please sign in to comment.