diff --git a/.gitignore b/.gitignore index 402d8a7..b026416 100644 --- a/.gitignore +++ b/.gitignore @@ -46,4 +46,5 @@ official_runs/* *.0 official_runs* *.pt -*.pth \ No newline at end of file +*.pth +/.idea diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index a9d8957..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,9 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml -# Editor-based HTTP Client requests -/httpRequests/ -# Datasource local storage ignored files -/dataSources/ -/dataSources.local.xml -*.out diff --git a/.idea/ScitePredict.iml b/.idea/ScitePredict.iml deleted file mode 100644 index d0876a7..0000000 --- a/.idea/ScitePredict.iml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml deleted file mode 100644 index f281ec3..0000000 --- a/.idea/inspectionProfiles/Project_Default.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index 696dc8a..edad29d 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,7 +1,7 @@ - - + \ No newline at end of file diff --git a/previous_methods/GPT_Predict.py b/previous_methods/GPT_Predict.py index eaacd0f..5e8b5d6 100644 --- a/previous_methods/GPT_Predict.py +++ b/previous_methods/GPT_Predict.py @@ -1,85 +1,33 @@ -import pandas as pd - -from tools.test import get_filename_without_extension -import copy -import json -import os -from urllib.error import URLError - -import requests -import tiktoken -from bs4 import BeautifulSoup -from langchain_core.exceptions import OutputParserException - def get_filename_without_extension(file_path): # Extract the filename without extension filename_without_extension = os.path.splitext(os.path.basename(file_path))[0] return filename_without_extension -import pandas as pd -from concurrent.futures import ThreadPoolExecutor, as_completed +import os +import sys from concurrent.futures import ThreadPoolExecutor, as_completed -from tqdm import tqdm - -from sqlalchemy.orm.exc import NoResultFound -from sqlalchemy.orm import declarative_base - -from langchain.output_parsers import PydanticOutputParser, OutputFixingParser -from pydantic import BaseModel, Field -from typing import Dict, List -from langchain.output_parsers import PydanticOutputParser, OutputFixingParser import langchain +from langchain.output_parsers import PydanticOutputParser +from pydantic import BaseModel, Field +from tqdm import tqdm -from langchain.chains import LLMChain - -import os - -from langchain.prompts import PromptTemplate - -from langchain.chains.question_answering import load_qa_chain - - -import os -import sys BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.append(BASE_DIR) -from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter -from retry import retry langchain.debug = False -import arxiv import time import ssl ssl._create_default_https_context = ssl._create_unverified_context -import glob -from langchain_community.document_loaders import PDFMinerLoader from langchain_community.chat_models import ChatOpenAI from database.DBEntity import * -from furnace.arxiv_paper import Arxiv_paper, get_arxiv_id_from_url -from sqlalchemy import create_engine, and_ -from sqlalchemy.orm import sessionmaker, scoped_session -import logging -import datetime - -engine = create_engine('xxx/scitepredict') - -Base = declarative_base() - -Base.metadata.create_all(engine) - - -Session = sessionmaker(bind=engine) -session = Session() -SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) -session_factory = scoped_session(SessionLocal) -import PyPDF2 + items = [ "Engaging", "Controversial", "Rigorous", "Innovative", "Accessible", "Methodical", "Concise", "Persuasive", "Comprehensive", "Insightful", "Relevant", "Objective", "Replicable", "Structured", "Coherent", "Original", @@ -91,11 +39,19 @@ def get_filename_without_extension(file_path): "Speculation-driven", "Unethical", "Easy to understand", "Dull", "Well written", "Empirical", "Circumlocutory" ] +class GPT_Paper_Response_Fixer(BaseModel): + IMPACT: float = Field( + description="The predicted acadmic impact value range from 0 to 1. Results are rounded to two decimal places. e.g., 0.46") + + +lpqa_parser = PydanticOutputParser(pydantic_object=GPT_Paper_Response_Fixer) + + def parse_scores(content): - + # 解析评分内容 try: scores = [int(line.split()[1]) for line in content.split('\n')] - + # 计算均值 mean_score = sum(scores) / len(scores) return mean_score except Exception as e: @@ -103,7 +59,10 @@ def parse_scores(content): return 0 def paper_rating(abstract): - # download_paper(row, out_dir=r'J:\arxiv') + ''' + This is the original prompt used in the paper "Can ChatGPT be used to predict citation counts, readership, and social media interaction? An exploration among 2222 scientific abstracts" + Resulting poor performance with NDCG@20 below 0.1. + ''' prompt = f"Please rate the following abstract on each of the 60 items from 0 = Not at all to 100 = Very much. Only provide the numbers. For example:\n\n" prompt += "1. 65\n2. 50\n3. 5\n4. 95\n5. …\n\n" @@ -118,60 +77,95 @@ def paper_rating(abstract): return parse_scores(content) + + + + +def chat_qianfan(prompt): + import qianfan + # This is used to calculat 'LLaMA-3-generated' in Tab.2. You have to regist qianfan or try another LLM Inference API provider. + chat_comp = qianfan.ChatCompletion() + + # 指定特定模型 + resp = chat_comp.do(model="Meta-Llama-3-8B", messages=[{ + "role": "user", + "content": prompt + }]) + + print(resp["body"]) + return resp["body"].get('result') +def paper_rating_improved(row): + title = row['title'] + abstract = row['abstract'] + prompt = f'''Based on the following information, predict the academic impact of this paper as a single number between 0 and 1. Output only the number, with no additional text: + Title: {title} + Abstract: {abstract} + ONLY output a single number representing the future academic impact between 0 and 1. e.g., 0.69''' + + try: + + impact = float(chat_qianfan(prompt)) + time.sleep(0.5) + except Exception as e: + print(e) + return None + + return impact + def main(): - - data = pd.read_csv(r'xxx\NAID\NAID_test_extrainfo.csv') + # 读取数据 + data = pd.read_csv(r'NAID\NAID_test_extrainfo.csv') - scores = [] - - with ThreadPoolExecutor(max_workers=10) as executor: - - future_to_abstract = {executor.submit(paper_rating, abstract): abstract for abstract in data['abstract']} - - for future in tqdm(as_completed(future_to_abstract)): + with ThreadPoolExecutor(max_workers=1) as executor: # 这里将线程数设置为10 + + futures = [executor.submit(paper_rating_improved, row) for _, row in data.iterrows()] + + + for future in tqdm(as_completed(futures)): score = future.result() - scores.append(score) - + if score: + scores.append(score) + else: + scores.append(-1) + data['average_score'] = scores + data = data[data['average_score'] >= 0] - columns_to_save = ['id', 'cites', 'TNCSI', 'TNCSI_SP', 'abstract', 'average_score'] - data[columns_to_save].to_csv(r'gpt_predict.csv', index=False) + data[columns_to_save].to_csv(r'gpt_predict_llama-3.csv', index=False) import pandas as pd from sklearn.metrics import ndcg_score -import numpy as np def calculate_ndcg(file_path): - + data = pd.read_csv(file_path) - + if 'average_score' not in data.columns or 'cites' not in data.columns: return "The required columns are not in the dataframe." - - y_true = data['cites'].to_numpy() + + y_true = data['TNCSI_SP'].to_numpy() y_score = data['average_score'].to_numpy() # Reshape data for ndcg calculation (1, -1) as ndcg expects at least 2D arrays y_true = y_true.reshape(1, -1) y_score = y_score.reshape(1, -1) - + ndcg = ndcg_score(y_true, y_score,k=20) return ndcg - - +# if __name__ == "__main__": - # main() - ndcg_value = calculate_ndcg('gpt_predict.csv') - print(f"The NDCG value is: {ndcg_value}") \ No newline at end of file + main() + ndcg_value = calculate_ndcg('gpt_predict_improved.csv') + print(f"The NDCG value is: {ndcg_value}")