-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgpt_paper.py
140 lines (127 loc) · 5.17 KB
/
gpt_paper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""
Author: Burak Can KARA
Email: burakcankara@gmail.com
This script processes PDF files and analyzes them using GPT-4 API.
It extracts text from PDFs and generates detailed scientific analysis reports.
"""
import os
import requests
import fitz # PyMuPDF
import time
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Get configuration from environment variables
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
OPENAI_MODEL = os.getenv('OPENAI_MODEL', 'gpt-4') # Default to gpt-4 if not specified
# Validate environment variables
if not OPENAI_API_KEY:
raise ValueError("OPENAI_API_KEY not found in .env file. Please add your API key.")
def extract_text_from_pdf(pdf_path):
"""Extracts text content from a PDF file."""
print(f"Extracting text from {pdf_path}...")
try:
doc = fitz.open(pdf_path)
text = ""
for page_num in range(doc.page_count):
print(f"Processing page {page_num + 1}...")
page = doc.load_page(page_num)
text += page.get_text()
doc.close()
print(f"Text extraction completed for {pdf_path}.")
return text
except Exception as e:
print(f"Error extracting text from PDF: {e}")
return ""
def send_text_to_gpt(text):
"""Sends text to GPT model and returns detailed analysis in scientific review format."""
if not text:
print("Input text is empty. Skipping process.")
return ""
print(f"Sending text to {OPENAI_MODEL}...")
url = "https://api.openai.com/v1/chat/completions"
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {OPENAI_API_KEY}"
}
prompt = (
f"Analyze the following text and provide a detailed evaluation according to scientific review process. Output format should be:\n"
"1. Article Citation\n"
"Title:\n"
"Authors:\n"
"Journal:\n"
"Volume and Issue:\n"
"Publication Date:\n"
"DOI:\n"
"Publisher:\n"
"2. Research Purpose and Hypothesis\n"
"Research Topic:\n"
"Hypothesis/Problem Statement:\n"
"3. Participants and Study Area\n"
"Participant Information:\n"
"Study Area:\n"
"4. Methodology\n"
"Data Collection Method:\n"
"Tools/Instruments Used:\n"
"Data Analysis Method:\n"
"5. Results\n"
"Key Findings:\n"
"Statistical Results:\n"
"6. Authors' Recommendations and Discussion\n"
"Research Success Status:\n"
"Authors' Recommendations and Future Research:\n"
"7. Scientific Contribution and Strengths/Weaknesses\n"
"Strengths:\n"
"Weaknesses:\n"
"8. Summary and Scientific Evaluation\n"
f"Text: {text}\n"
"Note: Technical terms (e.g., Multiscale Geographically Weighted Regression - MGRW) should be kept as is without translation."
)
data = {
"model": OPENAI_MODEL,
"messages": [
{"role": "user", "content": prompt}
],
"max_tokens": 800
}
while True:
try:
response = requests.post(url, headers=headers, json=data)
if response.status_code == 200:
print("Successfully received response from GPT model.")
return response.json().get("choices", [{}])[0].get("message", {}).get("content", "")
elif response.status_code == 429:
print(f"Error: {response.status_code}, {response.text}. Waiting...")
time.sleep(15) # Wait when rate limit is reached
else:
print(f"Error: {response.status_code}, {response.text}")
return ""
except requests.exceptions.RequestException as e:
print(f"API request error: {e}")
return ""
def process_pdfs_in_directory(directory_path, output_directory):
"""Processes PDF files in the specified directory and saves the results."""
print(f"Processing PDF files in {directory_path}...")
if not os.path.exists(output_directory):
os.makedirs(output_directory)
print(f"Created directory: {output_directory}")
for filename in os.listdir(directory_path):
if filename.lower().endswith(".pdf"):
print(f"Processing file: {filename}")
pdf_path = os.path.join(directory_path, filename)
text = extract_text_from_pdf(pdf_path)
gpt_response = send_text_to_gpt(text)
if gpt_response:
# Save the result
output_file_path = os.path.join(output_directory, f"{os.path.splitext(filename)[0]}_output.txt")
with open(output_file_path, "w", encoding="utf-8") as output_file:
output_file.write(gpt_response)
print(f"Processed {filename} and saved results to {output_file_path}")
else:
print(f"Error processing {filename}, results not saved.")
if __name__ == "__main__":
pdf_directory = "Papers" # Directory containing PDF files
output_directory = "Result" # Directory for saving results
print("Starting PDF processing...")
process_pdfs_in_directory(pdf_directory, output_directory)
print("PDF processing completed.")