Skip to content

Commit f517880

Browse files
Remorax authored and bhaveshAn committed
Addresses #468 Added bing news search (#489)
* Added bing news search * Fixed Travis
1 parent cf912cc commit f517880

File tree

4 files changed

+51
-144
lines changed

4 files changed

+51
-144
lines changed

app/scrapers/__init__.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -44,16 +44,6 @@ def feed_gen(query, engine, count=10, qtype=''):
4444
engine = old_names.get(engine, engine)
4545
if engine in ('quora', 'youtube'):
4646
urls = scrapers[engine].search_without_count(query)
47-
elif engine in ('baidu', 'parsijoo', 'yahoo') and qtype == 'news':
48-
urls = scrapers[engine].news_search(query, count, qtype)
49-
elif engine == 'mojeek' and qtype == 'news':
50-
urls = scrapers[engine].news_search_without_count(query)
51-
elif engine in ('bing', 'parsijoo') and qtype == 'vid':
52-
urls = scrapers[engine].video_search_without_count(query)
53-
elif engine in ('bing', 'parsijoo') and qtype == 'isch':
54-
urls = scrapers[engine].image_search_without_count(query)
55-
elif engine in ('ask',) and qtype == 'vid':
56-
urls = scrapers[engine].video_search(query, count, qtype)
5747
else:
5848
urls = scrapers[engine].search(query, count, qtype)
5949
return urls

app/scrapers/bing.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,10 @@ def __init__(self):
1010
self.url = 'http://www.bing.com/search'
1111
self.videoURL = 'https://www.bing.com/videos/search'
1212
self.imageURL = 'https://www.bing.com/images/search'
13+
self.newsURL = 'https://www.bing.com/news/search'
1314
self.defaultStart = 1
1415
self.startKey = 'first'
1516
self.name = 'bing'
16-
self.videoKey = 'FORM'
17-
self.imageKey = 'FORM'
1817

1918
@staticmethod
2019
def parse_response(soup):
@@ -73,3 +72,26 @@ def parse_image_response(soup):
7372
print('Bing parsed: ' + str(urls))
7473

7574
return urls
75+
76+
@staticmethod
77+
def parse_news_response(soup):
78+
""" Parses the reponse and return set of urls
79+
Returns: urls (list)
80+
[[Tile1,url1], [Title2, url2],..]
81+
"""
82+
urls = []
83+
for div in soup.findAll('div', {'class': 't_s'}):
84+
link = div.find('a', {'class': 'title'})
85+
url = link['href']
86+
title = link.getText()
87+
title = title.replace('\n', '').replace(' ', '')
88+
desc = div.find('div', {'class': 'snippet'}).getText()
89+
desc = desc.replace('\n', '').replace(' ', '')
90+
url_entry = {'title': title,
91+
'link': url,
92+
'desc': desc}
93+
urls.append(url_entry)
94+
95+
print('Bing parsed: ' + str(urls))
96+
97+
return urls

app/scrapers/generalized.py

Lines changed: 13 additions & 132 deletions
Original file line numberDiff line numberDiff line change
@@ -19,49 +19,25 @@ class Scraper:
1919
}
2020

2121
def __init__(self):
22+
self.name = "general"
2223
pass
2324

2425
def get_page(self, query, startIndex=0, qtype=''):
2526
""" Fetch the google search results page
2627
Returns : Results Page
2728
"""
2829
url = self.url
29-
if qtype == 'vid':
30-
if self.name in ['yahoo']:
30+
if qtype == 'vid' and self.name in ['yahoo', 'ask', 'parsijoo',
31+
'bing']:
3132
url = self.videoURL
32-
elif self.name in ['ask']:
33-
url = self.videoURL
34-
payload = {self.queryKey: query, self.startKey: startIndex}
35-
response = requests.get(
36-
url, headers=self.headers, params=payload
37-
)
38-
return response
39-
else:
40-
url = self.url
41-
elif qtype == 'isch':
42-
if self.name in ['yahoo']:
33+
elif qtype == 'isch' and self.name in ['yahoo', 'parsijoo', 'bing']:
4334
url = self.imageURL
44-
else:
45-
url = self.url
46-
elif qtype == 'news':
47-
if self.name == 'baidu':
48-
url = self.newsURL
49-
payload = {'word': query, self.startKey: startIndex}
50-
response = requests.get(
51-
url, headers=self.headers, params=payload
52-
)
53-
return response
54-
elif self.name == 'parsijoo':
55-
url = self.newsURL
56-
payload = {self.queryKey: query, 'page': startIndex}
57-
response = requests.get(
58-
url, headers=self.headers, params=payload
59-
)
60-
return response
61-
elif self.name == 'yahoo':
62-
url = self.newsURL
35+
elif qtype == 'news' and self.name in ['baidu', 'parsijoo', 'mojeek', 'bing']:
36+
url = self.newsURL
6337
payload = {self.queryKey: query, self.startKey: startIndex,
6438
self.qtype: qtype}
39+
if self.name == 'mojeek' and qtype == 'news':
40+
payload['fmt'] = 'news'
6541
response = requests.get(url, headers=self.headers, params=payload)
6642
print(response.url)
6743
return response
@@ -98,16 +74,13 @@ def search(self, query, num_results, qtype=''):
9874

9975
def call_appropriate_parser(self, qtype, soup):
10076
new_results = ''
101-
if qtype == 'vid':
102-
if self.name in ['yahoo']:
77+
if qtype == 'vid' and self.name in ['yahoo', 'ask', 'parsijoo',
78+
'bing']:
10379
new_results = self.parse_video_response(soup)
104-
else:
105-
new_results = self.parse_response(soup)
106-
elif qtype == 'isch':
107-
if self.name in ['yahoo']:
80+
elif qtype == 'isch' and self.name in ['yahoo', 'parsijoo', 'bing']:
10881
new_results = self.parse_image_response(soup)
109-
else:
110-
new_results = self.parse_response(soup)
82+
elif qtype == 'news' and self.name in ['parsijoo', 'mojeek', 'baidu', 'bing']:
83+
new_results = self.parse_news_response(soup)
11184
else:
11285
new_results = self.parse_response(soup)
11386
return new_results
@@ -123,95 +96,3 @@ def search_without_count(self, query):
12396
soup = BeautifulSoup(response.text, 'html.parser')
12497
urls = self.parse_response(soup)
12598
return urls
126-
127-
def video_search(self, query, num_results, qtype=''):
128-
urls = []
129-
current_start = self.defaultStart
130-
131-
while (len(urls) < num_results):
132-
response = self.get_page(query, current_start, qtype)
133-
soup = BeautifulSoup(response.text, 'html.parser')
134-
if qtype == 'vid':
135-
if self.name in ['yahoo', 'ask']:
136-
new_results = self.parse_video_response(soup)
137-
else:
138-
new_results = self.parse_response(soup)
139-
else:
140-
new_results = self.parse_response(soup)
141-
if new_results is None:
142-
break
143-
urls.extend(new_results)
144-
current_start = self.next_start(current_start, new_results)
145-
return urls[: num_results]
146-
147-
def video_search_without_count(self, query):
148-
"""
149-
Search for the query and return set of urls
150-
Returns: list
151-
"""
152-
urls = []
153-
if self.name in ['parsijoo']:
154-
url = self.videoURL
155-
payload = {self.queryKey: query}
156-
elif self.name in ['bing']:
157-
url = self.videoURL
158-
payload = {self.queryKey: query, self.videoKey: 'HDRSC3'}
159-
response = requests.get(url, headers=self.headers, params=payload)
160-
soup = BeautifulSoup(response.text, 'html.parser')
161-
urls = self.parse_video_response(soup)
162-
if len(urls) == 0:
163-
return "No video with this Keyword"
164-
else:
165-
return urls
166-
167-
def image_search_without_count(self, query):
168-
"""
169-
Search for the query and return set of urls
170-
Returns: list
171-
"""
172-
urls = []
173-
if self.name in ['parsijoo']:
174-
url = self.imageURL
175-
payload = {self.queryKey: query}
176-
elif self.name in ['bing']:
177-
url = self.imageURL
178-
payload = {self.queryKey: query, self.imageKey: 'HDRSC2'}
179-
response = requests.get(url, headers=self.headers, params=payload)
180-
soup = BeautifulSoup(response.text, 'html.parser')
181-
urls = self.parse_image_response(soup)
182-
return urls
183-
184-
def news_search(self, query, num_results, qtype=''):
185-
"""
186-
Search for the query and return set of urls
187-
Returns: list
188-
"""
189-
urls = []
190-
if self.name == 'parsijoo':
191-
current_start = self.newsStart
192-
else:
193-
current_start = self.defaultStart
194-
195-
while (len(urls) < num_results):
196-
response = self.get_page(query, current_start, qtype)
197-
soup = BeautifulSoup(response.text, 'html.parser')
198-
new_results = self.parse_news_response(soup)
199-
if new_results is None:
200-
break
201-
urls.extend(new_results)
202-
current_start = self.next_start(current_start, new_results)
203-
return urls[: num_results]
204-
205-
def news_search_without_count(self, query):
206-
"""
207-
Search for the query and return set of urls
208-
Returns: list
209-
"""
210-
urls = []
211-
if self.name == 'mojeek':
212-
url = self.newsURL
213-
payload = {self.queryKey: query, 'fmt': 'news'}
214-
response = requests.get(url, headers=self.headers, params=payload)
215-
soup = BeautifulSoup(response.text, 'html.parser')
216-
urls = self.parse_news_response(soup)
217-
return urls

test/test_bing.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,17 @@ def test_parse_video_response():
4040
'link': link_video,
4141
}]
4242
assert resp == expected_resp
43+
44+
45+
def test_parse_news_response():
46+
html_text = """<div class="t_s"><div class="t_t"><a class="title"
47+
href="mock_url">mock_title</a></div><div class="snippet">
48+
mock_desc</div></div>"""
49+
dummy_soup = BeautifulSoup(html_text, 'html.parser')
50+
resp = Bing().parse_news_response(dummy_soup)
51+
expected_resp = [{
52+
'title': u'mock_title',
53+
'link': u'mock_url',
54+
'desc': u'mock_desc',
55+
}]
56+
assert resp == expected_resp

0 commit comments

Comments (0)