Skip to content

Commit d01c179

Browse files
fazeem84mariobehling
authored and committed
Increase number of results to 100 fossasia#59 (fossasia#62)
Updated google_search and extended the get_google_page function by introducing a start-index parameter. The start index is sent to the Google URL as an HTTP request parameter. (This involves 10 HTTP calls to Google to fetch 100 results.)
1 parent 471bbb0 commit d01c179

File tree

1 file changed

+19
-7
lines changed

1 file changed

+19
-7
lines changed

app/scraper.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,16 @@ def get_google_page(query):
8080
payload = {'q': query}
8181
response = requests.get('https://www.google.com/search', headers=header, params=payload)
8282
return response
83+
84+
def get_google_page(query, startIndex=0):
    """Fetch a Google search results page.

    NOTE(review): Python does not support overloading — this definition
    shadows the earlier one-argument get_google_page in the same module.
    startIndex therefore defaults to 0 so existing callers that pass only
    the query keep working.

    Args:
        query: Search terms, sent as the 'q' query parameter.
        startIndex: Zero-based offset of the first result to fetch,
            sent to Google as the 'start' parameter (Google serves
            10 results per page).

    Returns:
        The requests.Response for the results page.
    """
    # Spoof a desktop-browser User-Agent; Google serves different
    # (or blocked) markup to obvious non-browser clients.
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}
    payload = {'q': query, 'start': startIndex}
    response = requests.get('https://www.google.com/search', headers=header, params=payload)
    return response
8393

8494

8595
def google_search(query):
    """Fetch up to 100 Google search results for the given query.

    Issues 10 paginated requests (10 results per page) through
    get_google_page, advancing the start index by 10 each time.

    Args:
        query: Search terms to look up.

    Returns:
        A list of dicts, one per result:
        [{'title': Title1, 'link': url1}, {'title': Title2, 'link': url2}, ...]
    """
    urls = []
    # Google paginates 10 results per page; 'start' is the result offset.
    for page in range(10):
        response = get_google_page(query, page * 10)
        soup = BeautifulSoup(response.text, 'html.parser')
        for h3 in soup.findAll('h3', {'class': 'r'}):
            link = h3.find('a')
            if link is None:
                # Some result headers carry no anchor (e.g. ad/answer
                # boxes); skip instead of crashing on .getText().
                continue
            urls.append({'title': link.getText(),
                         'link': link.get('href')})

    return urls
99110

@@ -154,4 +165,5 @@ def feedgen(query, engine):
154165
urls = bing_search(query)
155166
result = urls
156167
print(result)
157-
return result
168+
print(len(result))
169+
return result

0 commit comments

Comments
 (0)