240808

ssocean · Aug 8, 2024 · fdb1b67 · fdb1b67
1 parent b7d438d
commit fdb1b67
Show file tree

Hide file tree

Showing 7 changed files with 4,345 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -78,9 +78,22 @@ OMP_NUM_THREADS=1 accelerate launch offcial_train.py \
 Then, type `sh train.sh` in the console. Wating for the training ends~
 
 ## Testing (batch)
+Similar to Fine-tuning, prepare `test.sh` as below:
+```
 python inference.py \
  --data_path ScImpactPredict/NAID/NAID_test_extrainfo.csv \
  --weight_dir path_to_runs_dir
+```
+Then, type `sh test.sh`.
 
 ## Testing (single article)
-Just modified the `single_pred.py` file, then type `python single_pred.py`.
+Just modify the `single_pred.py` file, then run `python single_pred.py`.
+
+## Model Weights
+First, download the pretrained LLaMA-3 weights from the official Hugging Face site.
+Then, download the provided LoRA weights (runs_dir) [here](https://drive.google.com/file/d/13-ugXsm35AuzOBUlL6jPacY_z8qVIb7x/view?usp=sharing).
+
+## Comparison with Previous Methods
+The scripts for the previous methods should work fine after a few adjustments based on your specific needs. Since these models train very quickly (less than a few minutes on a single RTX 3080), we won’t be providing the trained models.
+
+### We are pretty confident in our methodology and experiments, and you should be able to reproduce any of the results reported in our paper.
diff --git a/TKPD/TKPD.csv b/TKPD/TKPD.csv
diff --git a/TKPD/prompt_keyword_async_search.py b/TKPD/prompt_keyword_async_search.py
@@ -0,0 +1,84 @@
+import os
+import csv
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from retry import retry
+import Levenshtein
+from tqdm import tqdm
+
+# NOTE(review): "sk-xxx" / "xxx" look like placeholders — supply a real key and
+# base URL before running.
+API_SECRET_KEY = "sk-xxx"
+BASE_URL = "xxx"
+# Exported before the langchain imports below; presumably the OpenAI client
+# reads these variables from the environment — confirm.
+os.environ["OPENAI_API_KEY"] = API_SECRET_KEY
+os.environ["OPENAI_API_BASE"] = BASE_URL
+from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
+from langchain_openai import ChatOpenAI
+
+def normalized_edit_distance(str1, str2):
+    """Return the Levenshtein distance of two strings scaled by the longer length.
+
+    Both inputs are stripped of surrounding whitespace and lower-cased before
+    they are compared. If both normalized strings are empty the result is 0.
+    """
+    left, right = (text.strip().lower() for text in (str1, str2))
+    longest = max(len(left), len(right))
+    if longest == 0:
+        # Nothing to compare; also avoids dividing by zero.
+        return 0
+    return Levenshtein.distance(left, right) / longest
+# Alternative user prompts for keyphrase extraction. Nothing in the visible code
+# reads this list; presumably kept for prompt-comparison experiments — confirm.
+usr_prompts = ["Given the following title and abstract of the research paper, identify the core task or problem being addressed in few words. You MUST respond with the keyphrase ONLY in this format: xxx",
+               "Based on the given title and abstract, what is the main focus or task of the research? Summarize it in a few words. You MUST respond with the keyphrase ONLY in this format: xxx",
+               "Analyze the title and abstract provided to identify the central task or topic of the paper, which will be used as a keyword for searching related academic papers on Google Scholar. Avoid terms that are either too broad (such as 'deep learning' or 'computer vision') or too specific (such as certain model names, unless widely recognized.). You MUST respond with the keyword ONLY in this format: xxx"
+               ]
+@retry(delay=2)
+def get_chatgpt_field(title, abstract=None, sys_content=None, usr_prompt=None, extra_prompt=True, model="gpt-3.5-turbo-0125", temperature=0):
+    """Ask a chat model for the topic keyword of a paper.
+
+    Args:
+        title: Paper title, always included in the final question.
+        abstract: Optional abstract; appended to the question when truthy.
+        sys_content: System message text; a built-in default is used when falsy.
+        usr_prompt: Instruction text; a built-in default is used when falsy.
+        extra_prompt: When true, a one-shot example exchange is inserted before
+            the real question.
+        model: Model name passed to ChatOpenAI.
+        temperature: Sampling temperature passed to ChatOpenAI.
+
+    Returns:
+        The ``content`` of the model's reply to the single message list sent.
+
+    NOTE(review): the function is wrapped in ``retry(delay=2)`` with no retry
+    limit given; presumably failures are retried indefinitely — confirm against
+    the ``retry`` package defaults.
+    """
+    sys_content = sys_content or (
+            "You are a profound researcher who is good at identifying the topic key phrase from paper's title and "
+            "abstract. Ensure that the topic key phrase precisely defines the research area within the article. For effective academic searching, such as on Google Scholar, the field should be specifically targeted rather than broadly categorized. For instance, use 'image classification' instead of the general 'computer vision' to enhance relevance and searchability of related literature.")
+    usr_prompt = usr_prompt or ("Analyze the title and abstract provided to identify the central topic of the paper, which will be used as a keyword for searching related academic papers on Google Scholar. Avoid terms that are either too broad (such as 'deep learning' or 'computer vision') or too specific (such as obscure model names, unless widely recognized). Focus on a keyword that reflects the innovative aspect or core methodology of the study. You MUST respond with the keyword ONLY in this format: xxx")
+
+    # Worked example for the one-shot exchange; left empty when no abstract is given.
+    example_paper = '''
+    Given Title: Large Selective Kernel Network for Remote Sensing Object Detection
+    Given Abstract: Recent research on remote sensing object detection has largely focused on improving the representation of oriented bounding boxes but has overlooked the unique prior knowledge presented in remote sensing scenarios. Such prior knowledge can be useful because tiny remote sensing objects may be mistakenly detected without referencing a sufficiently long-range context, which can vary for different objects. This paper considers these priors and proposes the lightweight Large Selective Kernel Network (LSKNet). LSKNet can dynamically adjust its large spatial receptive field to better model the ranging context of various objects in remote sensing scenarios. To our knowledge, large and selective kernel mechanisms have not been previously explored in remote sensing object detection. Without bells and whistles, our lightweight LSKNet sets new state-of-the-art scores on standard benchmarks, i.e., HRSC2016 (98.46% mAP), DOTA-v1.0 (81.85% mAP), and FAIR1M-v1.0 (47.87% mAP).''' if abstract else ''
+
+    one_shot = []
+    if extra_prompt:
+        one_shot = [
+            HumanMessage(content=f'''{usr_prompt}\n\n{example_paper}'''),
+            AIMessage(content='remote sensing object detection'),
+        ]
+
+    # The whitespace inside this literal is part of the prompt sent to the model.
+    question = f'''{usr_prompt}\n
+                Given Title: {title}
+            '''
+    if abstract:
+        question += f'Given Abstract: {abstract}'
+
+    messages = [SystemMessage(content=sys_content), *one_shot, HumanMessage(content=question)]
+
+    chat = ChatOpenAI(model=model, temperature=temperature)
+    replies = chat.batch([messages])
+    return replies[0].content
+import csv
+from multiprocessing import Pool
+# User prompt under evaluation: process_row passes it to get_chatgpt_field and
+# main echoes it next to the resulting average distance.
+prompt = "Identify the research field from the given title and abstract. You MUST respond with the keyword ONLY in this format: xxx"
+def process_row(row):
+    """Predict a keyword for one CSV row and score it against the ground truth.
+
+    Args:
+        row: Sequence whose items 0, 1 and 2 are the title, the abstract and the
+            ground-truth keyword.
+
+    Returns:
+        The normalized edit distance between the ground-truth keyword and the
+        keyword returned by ``get_chatgpt_field``.
+    """
+    title, abstract, gt_keyword = row[0], row[1], row[2]
+    predicted = get_chatgpt_field(title, abstract, usr_prompt=prompt)
+    distance = normalized_edit_distance(gt_keyword, predicted)
+    print(f'GT:{gt_keyword} \t Pred:{predicted} \t Ned:{distance}')
+    return distance
+
+
+def main():
+    """Score every row of ``TKPD.csv`` and print the mean normalized edit distance.
+
+    The CSV is read with GBK encoding from the current working directory, each
+    row is handed to ``process_row`` through a pool of 12 worker processes, and
+    the average of the returned distances is printed together with ``prompt``.
+    """
+    with open(r'TKPD.csv','r', newline='', encoding='gbk') as input_csvfile:
+        # csv.reader yields an empty list for a blank line; such a row would make
+        # process_row fail on row[0], so blank rows are dropped here.
+        rows = [row for row in csv.reader(input_csvfile) if row]
+    print(len(rows))
+    with Pool(12) as p:
+        results = p.map(process_row, rows)
+
+    # Guard against an empty file so the average never divides by zero.
+    average_distance = sum(results) / len(results) if results else 0
+    print(f"{prompt}: {average_distance}")
+
+# The guard keeps worker processes that re-import this module from re-running main().
+if __name__ == '__main__':
+    main()
diff --git a/previous_methods/Doc2Vec&Bi-LSTM.py b/previous_methods/Doc2Vec&Bi-LSTM.py
@@ -0,0 +1,250 @@
+
+import pandas as pd
+import numpy as np
+import torch
+from torch.utils.data import Dataset
+from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+import nltk
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+import string
+from torch.utils.data import DataLoader
+import torch.nn as nn
+from sklearn.model_selection import train_test_split
+import nltk
+from tqdm import tqdm
+
+# Fetch every NLTK resource this script uses: the tokenizer data plus the
+# stopword list and WordNet data that preprocess_text needs. Without the last
+# two, a fresh environment fails with LookupError on the first preprocess call.
+nltk.download('punkt')
+nltk.download('omw-1.4')
+nltk.download('stopwords')
+nltk.download('wordnet')
+
+# def dcg_at_k(scores, k):
+#     """
+#     scores: a list of relevance scores in predicted order
+#     k: number of results to consider
+#     """
+#     scores = np.asfarray(scores)[:k]
+#     return np.sum(scores / np.log2(np.arange(2, scores.size + 2)))
+#
+# def ndcg_at_k(predicted_scores, true_scores, k):
+#     """
+#     predicted_scores: model's predicted scores
+#     true_scores: ground truth scores
+#     k: number of results to consider
+#     """
+#     idcg = dcg_at_k(sorted(true_scores, reverse=True), k)
+#     dcg = dcg_at_k(predicted_scores, k)
+#     return dcg / idcg if idcg > 0 else 0
+
+import torch
+import numpy as np
+
+from sklearn.metrics import ndcg_score
+
+def NDCG_k(predictions, labels, k=20):
+    """Compute NDCG@k for one ranked list of predictions.
+
+    Args:
+        predictions: Tensor of predicted scores; singleton dimensions are squeezed.
+        labels: Tensor of ground-truth scores, squeezed the same way.
+        k: Cut-off rank passed to ``ndcg_score``.
+
+    Returns:
+        The NDCG@k value, or -1 when fewer than ``k`` predictions are available.
+    """
+    # Log the incoming shapes once (the original nested print also emitted "None").
+    print(predictions.shape, labels.shape)
+    # atleast_1d: squeezing a single-element tensor gives a 0-d array, on which
+    # len() would raise TypeError.
+    predictions = np.atleast_1d(predictions.squeeze().detach().cpu().numpy())
+    labels = np.atleast_1d(labels.squeeze().detach().cpu().numpy())
+    if len(predictions) < k:
+        # Sentinel kept for callers: not enough samples to rank k items.
+        return -1
+
+    # ndcg_score expects 2-D input, so the single list is wrapped in a batch of one.
+    ndcg = ndcg_score([labels], [predictions], k=k)
+
+    print("Average NDCG:", ndcg)
+    return ndcg
+
+# nltk.download('stopwords')
+# nltk.download('wordnet')
+
+
+def preprocess_text(text):
+    """Turn raw text into a list of lemmatized, stopword-free tokens.
+
+    The text is lower-cased, characters in ``string.punctuation`` are removed,
+    the result is tokenized with NLTK, English stopwords are dropped and every
+    remaining token is lemmatized.
+    """
+    lemmatizer = WordNetLemmatizer()
+    stop_words = set(stopwords.words('english'))
+
+    # Delete punctuation in one pass instead of filtering character by character.
+    cleaned = text.lower().translate(str.maketrans('', '', string.punctuation))
+
+    return [
+        lemmatizer.lemmatize(token)
+        for token in nltk.word_tokenize(cleaned)
+        if token not in stop_words
+    ]
+
+
+
+class PapersDataset(Dataset):
+    """Dataset yielding a (Doc2Vec vector, target value) pair for each paper row."""
+
+    def __init__(self, dataframe, doc2vec_model,target_type='TNCSI'):
+        """
+        dataframe: DataFrame holding the training or test data
+        doc2vec_model: an already-trained Doc2Vec model
+        target_type: label column to use when it starts with 'TNCSI';
+            any other value selects the 'cites' column
+        """
+        self.dataframe = dataframe
+        self.doc2vec_model = doc2vec_model
+        self.target_type = target_type
+
+    def __len__(self):
+        """Number of rows in the wrapped DataFrame."""
+        return len(self.dataframe)
+
+    def __getitem__(self, idx):
+        """Return float32 tensors (inferred vector, label) for row ``idx``."""
+        record = self.dataframe.iloc[idx]
+        # Title and abstract are joined with a space and embedded as one document.
+        tokens = preprocess_text(f"{record['title']} {record['abstract']}")
+        embedding = self.doc2vec_model.infer_vector(tokens)
+        label_column = self.target_type if self.target_type.startswith('TNCSI') else 'cites'
+        return (
+            torch.tensor(embedding, dtype=torch.float32),
+            torch.tensor(record[label_column], dtype=torch.float32),
+        )
+
+
+def train_doc2vec(documents):
+    """Train a Doc2Vec model on the given raw documents.
+
+    Each document is preprocessed with ``preprocess_text`` and tagged with its
+    position in ``documents``. The model uses 100-dimensional vectors.
+    """
+    corpus = []
+    for tag, text in enumerate(documents):
+        corpus.append(TaggedDocument(words=preprocess_text(text), tags=[tag]))
+    return Doc2Vec(corpus, vector_size=100, window=2, min_count=1, workers=4, epochs=40)
+
+
+
+class Attention(nn.Module):
+    """Attention pooling: collapse dimension 1 of the input into a weighted sum."""
+
+    def __init__(self, hidden_size):
+        super().__init__()
+        # Maps each feature vector to a single unnormalized score.
+        self.linear = nn.Linear(hidden_size, 1)
+
+    def forward(self, lstm_output):
+        """Return the softmax-weighted sum of ``lstm_output`` over dimension 1."""
+        scores = torch.tanh(self.linear(lstm_output))
+        # Normalize the scores along dimension 1 so the weights sum to one there.
+        attn = scores.softmax(dim=1)
+        # The trailing size-1 axis of ``attn`` broadcasts across the feature axis.
+        return (lstm_output * attn).sum(dim=1)
+
+
+class CitationModel(nn.Module):
+    """Bidirectional LSTM followed by attention pooling and a one-unit output layer."""
+
+    def __init__(self, embedding_dim, hidden_dim,target_type='TNCSI'):
+        super().__init__()
+        self.target_type = target_type
+        # Forward and backward LSTM states are concatenated, doubling the width.
+        bi_dim = hidden_dim * 2
+        self.bi_lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
+        self.attention = Attention(bi_dim)
+        self.fc = nn.Linear(bi_dim, 1)
+        self.sigmoid = nn.Sigmoid()
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        """Return one prediction per sample.
+
+        A sigmoid is applied when ``target_type`` starts with 'TNCSI',
+        otherwise a ReLU.
+        """
+        sequence, _ = self.bi_lstm(x)
+        pooled = self.attention(sequence)
+        activation = self.sigmoid if self.target_type.startswith('TNCSI') else self.relu
+        return activation(self.fc(pooled))
+
+
+
+
+def train_model(model, train_loader, criterion, optimizer, epochs, device):
+    """Train ``model`` for ``epochs`` passes over ``train_loader``.
+
+    Every batch is moved to ``device``; inputs and targets each gain a new axis
+    at position 1 before the forward pass. The mean batch loss of every epoch is
+    printed, and the full loss history is printed once training ends. Nothing
+    is returned.
+    """
+    model.train()
+    loss_history = []
+    for epoch_no, _ in enumerate(tqdm(range(epochs)), start=1):
+        batch_losses = []
+        for inputs, targets in train_loader:
+            inputs = inputs.to(device).unsqueeze(1)
+            targets = targets.to(device).unsqueeze(1)
+
+            optimizer.zero_grad()
+            loss = criterion(model(inputs), targets)
+            loss.backward()
+            optimizer.step()
+            batch_losses.append(loss.item())
+
+        # Mean of the per-batch losses for this epoch.
+        avg_loss = sum(batch_losses) / len(train_loader)
+        loss_history.append(avg_loss)
+        print(f'Epoch {epoch_no}, Loss: {avg_loss}')
+
+    print("Training complete. Loss history:")
+    print(loss_history)
+
+
+# Evaluation function for NDCG
+def evaluate_model(model, test_loader, device,k=20):
+    """Evaluate ``model`` on ``test_loader`` and return NDCG@k over all samples.
+
+    For every batch the predictions and targets are printed and their MSE is
+    accumulated; the mean of the per-batch MSE values is printed afterwards.
+    All predictions and targets are then concatenated and passed to ``NDCG_k``.
+
+    Args:
+        model: Module called on inputs that gain a new axis at position 1.
+        test_loader: Iterable of ``(inputs, target)`` batches that supports ``len``.
+        device: Device the inputs are moved to.
+        k: Cut-off rank forwarded to ``NDCG_k``.
+
+    Returns:
+        Whatever ``NDCG_k`` returns for the concatenated scores.
+    """
+    model.eval()
+    mse = nn.MSELoss()
+    pred_scores = []
+    target_scores = []
+    total_loss = 0
+    with torch.no_grad():
+        for inputs, target in test_loader:
+            inputs = inputs.to(device).unsqueeze(1)  # Ensure input is correctly shaped
+            outputs = model(inputs)
+
+            # Flatten with reshape(-1) rather than squeeze(): a final batch of one
+            # sample would otherwise become 0-d and break torch.cat below.
+            predicted_scores = outputs.reshape(-1)
+            true_scores = target.reshape(-1)
+            loss = mse(predicted_scores.detach().cpu(), true_scores.detach().cpu())
+            total_loss += loss.item()
+            print(predicted_scores)
+            print(true_scores)
+            print('-'*50)
+            pred_scores.append(predicted_scores)
+            target_scores.append(true_scores)
+
+    # Mean of the per-batch MSE values (not weighted by batch size).
+    avg_loss = total_loss / len(test_loader)
+    print(f'AVG MSE:{avg_loss}')
+
+    all_pred = torch.cat(pred_scores, dim=0)
+    all_GT = torch.cat(target_scores, dim=0)
+
+    ndcg = NDCG_k(all_pred, all_GT,k=k)
+    print(ndcg)
+
+    return ndcg
+# Main function
+def main():
+    """Train the Doc2Vec + Bi-LSTM baseline and report NDCG on the test split.
+
+    Steps: load the train/test CSV files, fit Doc2Vec on the training abstracts,
+    wrap both splits in ``PapersDataset``, train ``CitationModel`` for 30 epochs
+    with Adam and MSE loss, save the weights, then evaluate on the test split.
+
+    NOTE(review): Doc2Vec is fitted on abstracts only, while ``PapersDataset``
+    infers vectors from title + abstract — confirm this is intended.
+    """
+    target_type = 'TNCSI_SP'
+    ep = 30
+
+    # Hard-coded local paths; adjust them to where the NAID CSV files live.
+    train_data = pd.read_csv(r' Desktop\NAID_train_extrainfo.csv')
+    test_data = pd.read_csv(r' Desktop\NAID_test_extrainfo.csv')
+
+    doc2vec_model = train_doc2vec(train_data['abstract'].tolist())
+
+    train_loader = DataLoader(
+        PapersDataset(dataframe=train_data, doc2vec_model=doc2vec_model,target_type=target_type),
+        batch_size=1024,
+        shuffle=True,
+    )
+    test_loader = DataLoader(
+        PapersDataset(dataframe=test_data, doc2vec_model=doc2vec_model,target_type=target_type),
+        batch_size=1024,
+        shuffle=False,
+    )
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(device)
+
+    # embedding_dim matches the vector_size=100 used in train_doc2vec.
+    model = CitationModel(embedding_dim=100, hidden_dim=1024,target_type=target_type).to(device)
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
+    criterion = nn.MSELoss()
+
+    train_model(model, train_loader, criterion, optimizer, epochs=ep, device=device)
+    torch.save(model.state_dict(), f'LSTM-{target_type}-{ep}.pth')
+
+    # To evaluate a saved checkpoint instead of retraining, load it here:
+    # model.load_state_dict(torch.load(f'LSTM-{target_type}-{ep}.pth'))
+
+    evaluate_model(model, test_loader,  device=device)
+
+if __name__ == '__main__':
+    main()