Showing 7 changed files with 4,345 additions and 1 deletion.
@@ -0,0 +1,84 @@
import os
import csv
from multiprocessing import Pool

from retry import retry
import Levenshtein

# Configure credentials before importing the LangChain OpenAI client.
API_SECRET_KEY = "sk-xxx"
BASE_URL = "xxx"
os.environ["OPENAI_API_KEY"] = API_SECRET_KEY
os.environ["OPENAI_API_BASE"] = BASE_URL
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
from langchain_openai import ChatOpenAI
def normalized_edit_distance(str1, str2):
    """Levenshtein distance between two strings, normalized by the length of
    the longer one (0 = identical, 1 = maximally different)."""
    str1 = str1.strip().lower()
    str2 = str2.strip().lower()

    edit_distance = Levenshtein.distance(str1, str2)
    max_length = max(len(str1), len(str2))

    normalized_distance = edit_distance / max_length if max_length != 0 else 0
    return normalized_distance
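

# Sanity check of the metric on hand-picked inputs:
#   normalized_edit_distance("Image Classification", "image classification")  # -> 0.0
#   normalized_edit_distance("object detection",
#                            "object detection in remote sensing")            # -> 18/34 ≈ 0.53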
# Candidate user prompts for the keyphrase-extraction query.
usr_prompts = [
    "Given the following title and abstract of the research paper, identify the core task or problem being addressed in few words. You MUST respond with the keyphrase ONLY in this format: xxx",
    "Based on the given title and abstract, what is the main focus or task of the research? Summarize it in a few words. You MUST respond with the keyphrase ONLY in this format: xxx",
    "Analyze the title and abstract provided to identify the central task or topic of the paper, which will be used as a keyword for searching related academic papers on Google Scholar. Avoid terms that are either too broad (such as 'deep learning' or 'computer vision') or too specific (such as certain model names, unless widely recognized). You MUST respond with the keyword ONLY in this format: xxx",
]
@retry(delay=2)
def get_chatgpt_field(title, abstract=None, sys_content=None, usr_prompt=None,
                      extra_prompt=True, model="gpt-3.5-turbo-0125", temperature=0):
    """Ask the chat model for a paper's topic keyphrase, optionally priming it
    with a one-shot (title, abstract) -> keyphrase demonstration."""
    if not sys_content:
        sys_content = (
            "You are a profound researcher who is good at identifying the topic key phrase from a paper's title and "
            "abstract. Ensure that the topic key phrase precisely defines the research area within the article. "
            "For effective academic searching, such as on Google Scholar, the field should be specifically targeted "
            "rather than broadly categorized. For instance, use 'image classification' instead of the general "
            "'computer vision' to enhance the relevance and searchability of related literature.")
    if not usr_prompt:
        usr_prompt = (
            "Analyze the title and abstract provided to identify the central topic of the paper, which will be used "
            "as a keyword for searching related academic papers on Google Scholar. Avoid terms that are either too "
            "broad (such as 'deep learning' or 'computer vision') or too specific (such as obscure model names, "
            "unless widely recognized). Focus on a keyword that reflects the innovative aspect or core methodology "
            "of the study. You MUST respond with the keyword ONLY in this format: xxx")

    messages = [SystemMessage(content=sys_content)]

    # One-shot demonstration shown to the model before the real query.
    extra_abs_content = '''
Given Title: Large Selective Kernel Network for Remote Sensing Object Detection
Given Abstract: Recent research on remote sensing object detection has largely focused on improving the representation of oriented bounding boxes but has overlooked the unique prior knowledge presented in remote sensing scenarios. Such prior knowledge can be useful because tiny remote sensing objects may be mistakenly detected without referencing a sufficiently long-range context, which can vary for different objects. This paper considers these priors and proposes the lightweight Large Selective Kernel Network (LSKNet). LSKNet can dynamically adjust its large spatial receptive field to better model the ranging context of various objects in remote sensing scenarios. To our knowledge, large and selective kernel mechanisms have not been previously explored in remote sensing object detection. Without bells and whistles, our lightweight LSKNet sets new state-of-the-art scores on standard benchmarks, i.e., HRSC2016 (98.46% mAP), DOTA-v1.0 (81.85% mAP), and FAIR1M-v1.0 (47.87% mAP).''' if abstract else ''
    if extra_prompt:
        messages += [HumanMessage(content=f'{usr_prompt}\n\n{extra_abs_content}'),
                     AIMessage(content='remote sensing object detection')]

    content = f'{usr_prompt}\n\nGiven Title: {title}\n'
    if abstract:
        content += f'Given Abstract: {abstract}'
    messages.append(HumanMessage(content=content))

    chat = ChatOpenAI(model=model, temperature=temperature)
    return chat.batch([messages])[0].content
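

# Hypothetical usage (the returned keyphrase depends on the configured backend):
#   kwd = get_chatgpt_field("Large Selective Kernel Network for Remote Sensing Object Detection")
#   # e.g. 'remote sensing object detection'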
prompt = "Identify the research field from the given title and abstract. You MUST respond with the keyword ONLY in this format: xxx"


def process_row(row):
    """Predict the keyphrase for one (title, abstract, ground truth) row and
    return its normalized edit distance from the ground truth."""
    title, abstract, GT_kwd = row[0], row[1], row[2]
    pred_kwd = get_chatgpt_field(title, abstract, usr_prompt=prompt)
    ned = normalized_edit_distance(GT_kwd, pred_kwd)
    print(f'GT: {GT_kwd} \t Pred: {pred_kwd} \t NED: {ned}')
    return ned

def main():
    # TKPD.csv rows: title, abstract, ground-truth keyword (GBK-encoded).
    with open(r'TKPD.csv', 'r', newline='', encoding='gbk') as input_csvfile:
        reader = csv.reader(input_csvfile)
        rows = [row for row in reader]
    print(len(rows))

    # Query the API for all rows in parallel.
    with Pool(12) as p:
        results = p.map(process_row, rows)

    average_distance = sum(results) / len(results) if results else 0
    print(f"{prompt}: {average_distance}")


if __name__ == '__main__':
    main()
@@ -0,0 +1,250 @@
import string

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import ndcg_score
from tqdm import tqdm

# Tokenizer, lemmatizer, and stopword data used by preprocess_text below.
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

# A hand-rolled DCG/NDCG implementation, kept for reference; the evaluation
# below uses sklearn.metrics.ndcg_score instead.
# def dcg_at_k(scores, k):
#     """scores: relevance scores in predicted order; k: number of results."""
#     scores = np.asfarray(scores)[:k]
#     return np.sum(scores / np.log2(np.arange(2, scores.size + 2)))
#
# def ndcg_at_k(predicted_scores, true_scores, k):
#     idcg = dcg_at_k(sorted(true_scores, reverse=True), k)
#     dcg = dcg_at_k(predicted_scores, k)
#     return dcg / idcg if idcg > 0 else 0

def NDCG_k(predictions, labels, k=20):
    """NDCG@k over the full test set; returns -1 when there are fewer than k samples."""
    predictions = predictions.squeeze().detach().cpu().numpy()
    labels = labels.squeeze().detach().cpu().numpy()
    if len(predictions) < k:
        return -1

    ndcg = ndcg_score([labels], [predictions], k=k)
    print("Average NDCG:", ndcg)
    return ndcg
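

# Quick check with synthetic scores (a perfect ranking scores 1.0):
#   t = torch.arange(25, dtype=torch.float32)
#   NDCG_k(t, t, k=20)   # -> 1.0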

def preprocess_text(text):
    """Lowercase, strip punctuation, tokenize, drop stopwords, and lemmatize."""
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    words = nltk.word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return words
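

# e.g. preprocess_text("Dogs are running in the park.")  # -> ['dog', 'running', 'park']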

class PapersDataset(Dataset):
    def __init__(self, dataframe, doc2vec_model, target_type='TNCSI'):
        """
        dataframe: DataFrame holding the training or test data
        doc2vec_model: a trained Doc2Vec model
        """
        self.dataframe = dataframe
        self.doc2vec_model = doc2vec_model
        self.target_type = target_type

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        # Embed the concatenated title and abstract with the Doc2Vec model.
        metadata = f"{row['title']} {row['abstract']}"
        processed_text = preprocess_text(metadata)
        vector = self.doc2vec_model.infer_vector(processed_text)
        # TNCSI-style targets come from the column named by target_type;
        # anything else falls back to raw citation counts.
        if self.target_type.startswith('TNCSI'):
            label = row[self.target_type]
        else:
            label = row['cites']
        return torch.tensor(vector, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)

def train_doc2vec(documents):
    """Fit a Doc2Vec model on the training abstracts."""
    tagged_data = [TaggedDocument(words=preprocess_text(doc), tags=[i]) for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_data, vector_size=100, window=2, min_count=1, workers=4, epochs=40)
    return model
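
# Design note: min_count=1 keeps every token and window=2 uses a narrow context,
# which suits short title/abstract documents. Inferring a new vector:
#   doc2vec_model = train_doc2vec(["first abstract ...", "second abstract ..."])
#   doc2vec_model.infer_vector(preprocess_text("a new abstract")).shape  # -> (100,)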

class Attention(nn.Module):
    """Learned softmax pooling over the sequence dimension of the LSTM output."""
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, lstm_output):
        # lstm_output: (batch, seq_len, hidden_size)
        weights = torch.tanh(self.linear(lstm_output))  # (batch, seq_len, 1)
        weights = torch.softmax(weights, dim=1)         # normalize over seq_len
        weighted = torch.mul(lstm_output, weights.expand_as(lstm_output))
        return torch.sum(weighted, dim=1)               # (batch, hidden_size)
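
# Note: in this pipeline each paper is a single Doc2Vec vector unsqueezed to a
# length-1 sequence, so the softmax normalizes a single weight to 1 and the
# attention effectively passes the LSTM output through unchanged.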

class CitationModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, target_type='TNCSI'):
        super(CitationModel, self).__init__()
        self.bi_lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = Attention(hidden_dim * 2)  # bidirectional doubles the hidden size
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.target_type = target_type

    def forward(self, x):
        lstm_out, _ = self.bi_lstm(x)
        attention_out = self.attention(lstm_out)
        output = self.fc(attention_out)
        # Sigmoid bounds normalized TNCSI targets to (0, 1); ReLU keeps raw
        # citation counts non-negative.
        if self.target_type.startswith('TNCSI'):
            output = self.sigmoid(output)
        else:
            output = self.relu(output)
        return output
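

# Shape sketch with hypothetical sizes: a batch of 8 Doc2Vec vectors, each a
# length-1 sequence, yields one score per paper:
#   m = CitationModel(embedding_dim=100, hidden_dim=1024)
#   m(torch.randn(8, 1, 100)).shape  # -> torch.Size([8, 1])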

def train_model(model, train_loader, criterion, optimizer, epochs, device):
    model.train()
    loss_history = []
    for epoch in tqdm(range(epochs)):
        total_loss = 0
        for inputs, targets in train_loader:
            # Add a length-1 sequence dim for the LSTM and a trailing dim
            # so targets match the model output shape for MSE.
            inputs, targets = inputs.to(device).unsqueeze(1), targets.to(device).unsqueeze(1)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        loss_history.append(avg_loss)
        print(f'Epoch {epoch+1}, Loss: {avg_loss}')

    print("Training complete. Loss history:")
    print(loss_history)

# Evaluation: average MSE plus NDCG@k over the whole test set.
def evaluate_model(model, test_loader, device, k=20):
    model.eval()
    pred_scores = []
    target_scores = []
    total_loss = 0
    with torch.no_grad():
        for inputs, target in test_loader:
            inputs = inputs.to(device).unsqueeze(1)  # add the length-1 sequence dim
            outputs = model(inputs)

            # Flatten outputs and targets for the per-batch MSE.
            predicted_scores = outputs.squeeze()
            true_scores = target.squeeze()
            loss = nn.MSELoss()(predicted_scores.detach().cpu(), true_scores.detach().cpu())
            total_loss += loss.item()
            pred_scores.append(outputs)
            target_scores.append(true_scores)

    avg_loss = total_loss / len(test_loader)
    print(f'AVG MSE: {avg_loss}')

    # Rank all test papers at once for the NDCG computation.
    all_pred = torch.cat(pred_scores, dim=0).squeeze()
    all_GT = torch.cat(target_scores, dim=0).squeeze()
    ndcg = NDCG_k(all_pred, all_GT, k=k)
    return ndcg

# Main function
def main():
    csv_file = r' Desktop\NAID_train_extrainfo.csv'
    target_type = 'TNCSI_SP'

    train_data = pd.read_csv(csv_file)
    test_data = pd.read_csv(r' Desktop\NAID_test_extrainfo.csv')

    # Train the Doc2Vec model on the training-set abstracts
    train_documents = train_data['abstract'].tolist()
    doc2vec_model = train_doc2vec(train_documents)

    # Create training and testing datasets
    train_dataset = PapersDataset(dataframe=train_data, doc2vec_model=doc2vec_model, target_type=target_type)
    test_dataset = PapersDataset(dataframe=test_data, doc2vec_model=doc2vec_model, target_type=target_type)

    # Create DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

    # Model setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    model = CitationModel(embedding_dim=100, hidden_dim=1024, target_type=target_type).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
    criterion = nn.MSELoss()

    ep = 30
    train_model(model, train_loader, criterion, optimizer, epochs=ep, device=device)
    torch.save(model.state_dict(), f'LSTM-{target_type}-{ep}.pth')

    # To evaluate a saved checkpoint instead of retraining:
    # model.load_state_dict(torch.load(f'LSTM-{target_type}-{ep}.pth'))

    # Evaluate using NDCG
    evaluate_model(model, test_loader, device=device)


if __name__ == '__main__':
    main()