forked from arnav108276/InnoPatent
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathback.py
134 lines (122 loc) · 6.19 KB
/
back.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import logging
from urllib.parse import quote_plus

from flask import Flask, request, jsonify, render_template
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Flask application object; static files come from the 'static' folder and
# templates from the 'templates' folder.
app = Flask(__name__, static_folder='static', template_folder='templates')
# Configure logging to suppress unwanted messages: the root logger only
# lets records of level ERROR and above through.
logging.basicConfig(level=logging.ERROR)
# Fitness function to calculate relevance of a result to the target query
def fitness(result, target_query):
    """Score how relevant *result* is to *target_query*.

    The match is a case-insensitive substring test: a hit in the title is
    worth 2 points and a hit in the description is worth 1 point.

    Args:
        result: Mapping that may carry ``'title'`` and ``'description'``
            strings. A missing or ``None`` field is treated as empty text so
            that one incomplete result cannot abort the ranking of the rest.
        target_query: The search string to look for.

    Returns:
        An int from 0 (no match) to 3 (query found in both fields).
    """
    # Lower-case the query once instead of once per field.
    needle = target_query.lower()
    title = (result.get('title') or '').lower()
    description = (result.get('description') or '').lower()
    return 2 * int(needle in title) + int(needle in description)
# Grey Wolf Optimization (GWO) inspired ranking of the search results
def gwo_optimize_results(results, target_query):
    """Return *results* ordered from most to least relevant to *target_query*.

    Each result (a "wolf") is scored with ``fitness``. The three best-scoring
    entries take the alpha, beta and delta positions at the head of the
    returned list, followed by the remaining results in descending score
    order. Results with equal scores keep their original relative order
    because the sort is stable.

    Args:
        results: List of result dicts to rank.
        target_query: The user's search string.

    Returns:
        A new list containing every input result, best match first. An empty
        input yields an empty list. The input list is not modified.
    """
    # Score every wolf exactly once, then rank the whole pack. Sorting puts
    # alpha, beta and delta at indices 0-2 without indexing them explicitly,
    # so a pack of fewer than three wolves is returned in full and an empty
    # pack does not raise IndexError.
    scored = [(wolf, fitness(wolf, target_query)) for wolf in results]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [wolf for wolf, _ in scored]
# Scrape Google Patents data
def scrape_google_patents(query):
    """Scrape the first page of Google Patents search results for *query*.

    A Chrome session is started through the local ``chromedriver.exe``, the
    results page is loaded, and every ``search-result-item`` element found in
    the results section is converted to a dict. Items that cannot be parsed
    are reported on stdout and skipped. The browser is always closed before
    the function returns.

    Args:
        query: Free-text search string. It is URL-encoded before being placed
            in the request URL.

    Returns:
        A list of dicts with the keys ``title``, ``description``,
        ``authors``, ``id``, ``url`` and ``image``. The list is empty when
        the results section does not appear in time or nothing can be parsed.
    """
    # Encode before launching the browser so characters such as '&', '#',
    # '+' or spaces in the query cannot corrupt the URL's query string.
    encoded_query = quote_plus(query)

    chrome_options = Options()
    # chrome_options.add_argument('--headless')  # enable to run without a visible window
    driver = webdriver.Chrome(service=Service(r'chromedriver.exe'), options=chrome_options)
    results = []
    try:
        for page in range(1, 2):  # Adjust page range as needed
            url = f"https://patents.google.com/?q={encoded_query}&oq={encoded_query}&page={page}"
            driver.get(url)
            try:
                # Wait at most 2 seconds for the results section to appear.
                section = WebDriverWait(driver, 2).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'section.style-scope.search-results'))
                )
                result_items = section.find_elements(By.CSS_SELECTOR, 'search-result-item')
                for item in result_items:
                    try:
                        # NOTE(review): title and description are read with the same
                        # selector, so both currently hold the same text. Confirm the
                        # intended selector for the description against the live page.
                        title = item.find_element(By.CSS_SELECTOR, '#htmlContent').text.strip()
                        description = item.find_element(By.CSS_SELECTOR, '#htmlContent').text.strip()
                        patent_id = item.find_element(By.CSS_SELECTOR, '[data-proto="OPEN_PATENT_PDF"]').text.strip()
                        authors = item.find_elements(By.CSS_SELECTOR, '.style-scope.search-result-item')[0].text.strip()
                        patent_url = f"https://patents.google.com/patent/{patent_id}/en"
                        image_url = item.find_element(By.CSS_SELECTOR, 'img').get_attribute('src')  # Update based on your page structure
                        results.append({
                            'title': title,
                            'description': description,
                            'authors': authors,
                            'id': patent_id,
                            'url': patent_url,
                            'image': image_url,  # Include the image URL in the result
                        })
                    except Exception as e:
                        # Best effort: one unparsable item must not discard the rest.
                        print(f"Error extracting patent info: {e}")
            except Exception as e:
                # Covers the wait timing out as well as any lookup failure on the page.
                print(f"Error finding section: {e}")
    finally:
        driver.quit()
    return results
# Scrape Espacenet data and map to Google Patents
def scrape_espacenet(query):
    """Scrape the first page of Espacenet search results for *query*.

    A Chrome session is started through the local ``chromedriver.exe``, the
    Espacenet search page is loaded, and every result article is converted to
    a dict whose ``url`` points at the corresponding Google Patents page.
    Items that cannot be parsed are reported on stdout and skipped. The
    browser is always closed before the function returns.

    Args:
        query: Free-text search string. It is URL-encoded before being placed
            in the request URL.

    Returns:
        A list of dicts with the keys ``title``, ``description``, ``id``,
        ``url`` and ``image``. ``image`` is ``None`` when the item has no
        ``<img>`` element. The list is empty when no result articles appear
        in time or nothing can be parsed.
    """
    # Encode before launching the browser so characters such as '&', '#',
    # '+' or spaces in the query cannot corrupt the URL's query string.
    encoded_query = quote_plus(query)

    chrome_options = Options()
    # chrome_options.add_argument('--headless')  # enable to run without a visible window
    driver = webdriver.Chrome(service=Service(r'chromedriver.exe'), options=chrome_options)
    results = []
    try:
        for page in range(1, 2):  # Adjust page range as needed
            url = f"https://worldwide.espacenet.com/patent/search?q={encoded_query}&page={page}"
            driver.get(url)
            try:
                # Wait at most 2 seconds for the result articles to appear.
                items = WebDriverWait(driver, 2).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'article.item--wSceB4di'))
                )
                for item in items:
                    try:
                        title = item.find_element(By.CSS_SELECTOR, 'header.h2--2VrrSjFb').text.strip()
                        description = item.find_element(By.CSS_SELECTOR, '.copy-text--uk738M73').text.strip()
                        # The last path segment of the item's first link is used as the patent id.
                        patent_id = item.find_element(By.TAG_NAME, 'a').get_attribute('href').split('/')[-1]
                        patent_url = f"https://patents.google.com/patent/{patent_id}/en"
                        # Espacenet doesn't always have images, so the field is optional.
                        # find_elements returns an empty list when nothing matches, whereas
                        # find_element raises and would drop the whole result.
                        images = item.find_elements(By.CSS_SELECTOR, 'img')
                        image_url = images[0].get_attribute('src') if images else None
                        results.append({
                            'title': title,
                            'description': description,
                            'id': patent_id,
                            'url': patent_url,
                            'image': image_url,  # Include the image URL in the result if available
                        })
                    except Exception as e:
                        # Best effort: one unparsable item must not discard the rest.
                        print(f"Error extracting patent info: {e}")
            except Exception as e:
                # Covers the wait timing out as well as any lookup failure on the page.
                print(f"Error finding items: {e}")
    finally:
        driver.quit()
    return results
@app.route('/')
def home():
    """Serve the landing page rendered from the ``index2end.html`` template."""
    landing_page = render_template('index2end.html')
    return landing_page
@app.route('/search', methods=['GET'])
def search():
    """Handle ``GET /search?query=...`` and return ranked patent results.

    The query is sent to both scrapers, the two result lists are
    concatenated (Google Patents first), and the combined list is ranked with
    ``gwo_optimize_results``.

    Returns:
        A JSON response ``{"results": [...]}`` on success, or
        ``{"error": "Query parameter is required"}`` with HTTP status 400
        when the ``query`` parameter is missing, empty or only whitespace.
    """
    # Treat a whitespace-only query like a missing one instead of launching
    # two browser sessions for a search that cannot match anything useful.
    query = (request.args.get('query') or '').strip()
    if not query:
        return jsonify({"error": "Query parameter is required"}), 400

    google_results = scrape_google_patents(query)
    espacenet_results = scrape_espacenet(query)
    combined_results = google_results + espacenet_results

    # Nothing to rank: answer with an empty list rather than handing the
    # ranking step an empty input.
    if not combined_results:
        return jsonify({'results': []})

    optimized_results = gwo_optimize_results(combined_results, query)
    return jsonify({'results': optimized_results})
if __name__ == '__main__':
    # Start Flask's built-in server on localhost when run as a script.
    # NOTE(review): debug=True is meant for local development only; confirm
    # this entry point is never used for a deployed instance.
    app.run(host='127.0.0.1', port=5013, debug=True)