
Commit 9e09370

Fixes #472 #474 #476 Add News search in Baidu, Parsijoo & Mojeek (#486)
1 parent 8a91095 commit 9e09370

9 files changed: 158 additions & 5 deletions

app/scrapers/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -44,6 +44,10 @@ def feed_gen(query, engine, count=10, qtype=''):
     engine = old_names.get(engine, engine)
     if engine in ('quora', 'youtube'):
         urls = scrapers[engine].search_without_count(query)
+    elif engine in ('baidu', 'parsijoo') and qtype == 'news':
+        urls = scrapers[engine].news_search(query, count, qtype)
+    elif engine == 'mojeek' and qtype == 'news':
+        urls = scrapers[engine].news_search_without_count(query)
     elif engine in ('bing', 'parsijoo') and qtype == 'vid':
         urls = scrapers[engine].video_search_without_count(query)
     elif engine in ('bing', 'parsijoo') and qtype == 'isch':

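With this routing in place, a news query (qtype == 'news') goes to the counted news_search path for Baidu and Parsijoo, while Mojeek uses the single-request variant. A minimal sketch of calling the dispatcher directly, assuming the package is importable from the project root and the machine has network access (engine and query values are arbitrary examples):

from app.scrapers import feed_gen

# qtype='news' selects the new branches above; count only matters for the
# paginated engines (baidu, parsijoo).
results = feed_gen('fossasia', 'parsijoo', count=5, qtype='news')
for item in results:
    print(item['title'], '->', item['link'])
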
app/scrapers/baidu.py

Lines changed: 17 additions & 0 deletions
@@ -8,6 +8,7 @@ class Baidu(Scraper):
     def __init__(self):
         Scraper.__init__(self)
         self.url = 'https://www.baidu.com/s'
+        self.newsURL = 'http://news.baidu.com/ns'
         self.defaultStart = 0
         self.queryKey = 'wd'
         self.startKey = 'pn'
@@ -28,3 +29,19 @@ def parse_response(soup):
         print('Baidu parsed: ' + str(urls))
 
         return urls
+
+    @staticmethod
+    def parse_news_response(soup):
+        """ Parse the response and return set of urls
+        Returns: urls (list)
+                [[Title1, url1], [Title2, url2], ...]
+        """
+        urls = []
+        for h3 in soup.findAll('h3', {'class': 'c-title'}):
+            title = h3.a.getText()
+            link = h3.a.get('href')
+            urls.append({'title': title, 'link': link})
+
+        print('Baidu parsed: ' + str(urls))
+
+        return urls

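parse_news_response only needs an already-parsed page, so it can be tried offline. A small sketch using a hand-written fragment of the expected markup (the HTML below is illustrative, not a captured Baidu page):

from bs4 import BeautifulSoup
from app.scrapers.baidu import Baidu

html = '<h3 class="c-title"><a href="http://example.com/story">Example headline</a></h3>'
soup = BeautifulSoup(html, 'html.parser')
print(Baidu().parse_news_response(soup))
# [{'title': 'Example headline', 'link': 'http://example.com/story'}]
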
app/scrapers/generalized.py

Lines changed: 50 additions & 0 deletions
@@ -43,6 +43,21 @@ def get_page(self, query, startIndex=0, qtype=''):
                 url = self.imageURL
             else:
                 url = self.url
+        elif qtype == 'news':
+            if self.name == 'baidu':
+                url = self.newsURL
+                payload = {'word': query, self.startKey: startIndex}
+                response = requests.get(
+                    url, headers=self.headers, params=payload
+                )
+                return response
+            elif self.name == 'parsijoo':
+                url = self.newsURL
+                payload = {self.queryKey: query, 'page': startIndex}
+                response = requests.get(
+                    url, headers=self.headers, params=payload
+                )
+                return response
         payload = {self.queryKey: query, self.startKey: startIndex,
                    self.qtype: qtype}
         response = requests.get(url, headers=self.headers, params=payload)
@@ -163,3 +178,38 @@ def image_search_without_count(self, query):
         soup = BeautifulSoup(response.text, 'html.parser')
         urls = self.parse_image_response(soup)
         return urls
+
+    def news_search(self, query, num_results, qtype=''):
+        """
+        Search for the query and return set of urls
+        Returns: list
+        """
+        urls = []
+        if self.name == 'parsijoo':
+            current_start = self.newsStart
+        else:
+            current_start = self.defaultStart
+
+        while (len(urls) < num_results):
+            response = self.get_page(query, current_start, qtype)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            new_results = self.parse_news_response(soup)
+            if new_results is None:
+                break
+            urls.extend(new_results)
+            current_start = self.next_start(current_start, new_results)
+        return urls[: num_results]
+
+    def news_search_without_count(self, query):
+        """
+        Search for the query and return set of urls
+        Returns: list
+        """
+        urls = []
+        if self.name == 'mojeek':
+            url = self.newsURL
+            payload = {self.queryKey: query, 'fmt': 'news'}
+            response = requests.get(url, headers=self.headers, params=payload)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        urls = self.parse_news_response(soup)
+        return urls

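news_search drives get_page in a loop: it starts at newsStart (1) for Parsijoo or defaultStart otherwise, parses each page with the engine's parse_news_response, and advances via next_start until num_results items are collected or a page yields nothing. A minimal usage sketch through a concrete scraper, assuming network access to the live site (result counts depend on what the engine returns):

from app.scrapers.parsijoo import Parsijoo

scraper = Parsijoo()
articles = scraper.news_search('fossasia', 10, qtype='news')
for article in articles[:3]:
    print(article['title'], '->', article['link'])
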
app/scrapers/mojeek.py

Lines changed: 21 additions & 0 deletions
@@ -8,6 +8,7 @@ class Mojeek(Scraper):
     def __init__(self):
         Scraper.__init__(self)
         self.url = 'https://www.mojeek.co.uk/search'
+        self.newsURL = 'https://www.mojeek.co.uk/search'
         self.defaultStart = 1
         self.startKey = 's'
         self.name = 'mojeek'
@@ -27,3 +28,23 @@ def parse_response(soup):
         print('Mojeek parsed: ' + str(urls))
 
         return urls
+
+    @staticmethod
+    def parse_news_response(soup):
+        """ Parse the response and return the urls
+
+        Returns: urls (list)
+                [[url1], [url2], ...]
+        """
+        urls = []
+        for a in soup.findAll('a', attrs={'class': 'ob'}):
+            title = a.getText()
+            url = a.get('href')
+            urls.append({
+                'title': title,
+                'link': url
+            })
+
+        print('Mojeek parsed: ' + str(urls))
+
+        return urls

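Mojeek reuses its regular search URL and only adds fmt=news to the query, so there is no paging; news_search_without_count issues a single request and this parser collects the anchors with class 'ob'. An offline sketch of the parsing step (markup is illustrative):

from bs4 import BeautifulSoup
from app.scrapers.mojeek import Mojeek

html = '<a class="ob" href="http://example.com/item">Example item</a>'
soup = BeautifulSoup(html, 'html.parser')
print(Mojeek().parse_news_response(soup))
# [{'title': 'Example item', 'link': 'http://example.com/item'}]
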
app/scrapers/parsijoo.py

Lines changed: 22 additions & 0 deletions
@@ -1,5 +1,9 @@
 from __future__ import print_function
 from .generalized import Scraper
+try:
+    from urllib.parse import unquote  # Python 3
+except ImportError:
+    from urllib import unquote  # Python 2
 
 
 class Parsijoo(Scraper):
@@ -10,7 +14,9 @@ def __init__(self):
         self.url = 'https://parsijoo.ir/web'
         self.imageURL = 'https://image.parsijoo.ir/image'
         self.videoURL = 'https://video.parsijoo.ir/video'
+        self.newsURL = 'http://khabar.parsijoo.ir/search/'
         self.defaultStart = 0
+        self.newsStart = 1
         self.startKey = 'co'
         self.name = 'parsijoo'
 
@@ -71,3 +77,19 @@ def parse_image_response(soup):
         print('Parsijoo parsed: ' + str(urls))
 
         return urls
+
+    @staticmethod
+    def parse_news_response(soup):
+        """ Parse the response and return set of urls
+        Returns: urls (list)
+                [[Title1, url1], [Title2, url2], ...]
+        """
+        urls = []
+        for div in soup.findAll('div', {'class': 'news-title-link'}):
+            title = div.a.getText()
+            link = unquote(div.a.get('href'))
+            urls.append({'title': title, 'link': link})
+
+        print('Parsijoo parsed: ' + str(urls))
+
+        return urls

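Parsijoo news links can come back percent-encoded, which is why unquote is imported at the top of the file and applied to each href. An offline sketch (the fragment and URL are illustrative):

from bs4 import BeautifulSoup
from app.scrapers.parsijoo import Parsijoo

html = ('<div class="news-title-link">'
        '<a href="http://example.ir/some%20story">Example title</a></div>')
soup = BeautifulSoup(html, 'html.parser')
print(Parsijoo().parse_news_response(soup))
# [{'title': 'Example title', 'link': 'http://example.ir/some story'}]
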
app/templates/index.html

Lines changed: 7 additions & 5 deletions
@@ -98,14 +98,16 @@ <h1><code>query-server</code></h1>
 <div id="type" class="btn-group btn-group-vertical" style="display:inline-flex;padding:0; margin: 0 auto;" data-toggle="buttons">
     <label class=" active typeButton" style="padding:10px;">General<br/>
         <input type="radio" name = "stype" value="" autocomplete="off" checked>
-    </label>
+    </label>
     <label class=" typeButton" style="padding:10px;">Images<br/>
         <input type="radio" name = "stype" value="isch" autocomplete="off">
-    </label>
-    <label class=" typeButton" style="padding:10px;">
-        Video<br/>
+    </label>
+    <label class=" typeButton" style="padding:10px;">Video<br/>
         <input type="radio" name = "stype" value="vid" autocomplete="off">
-    </label>
+    </label>
+    <label class=" typeButton" style="padding:10px;">News<br/>
+        <input type="radio" name = "stype" value="news" autocomplete="off">
+    </label>
 </div>
 </div>
 <div class="col-sm-2 col-xs-6">

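Selecting the new News button makes the form submit stype=news alongside the query, which the backend is expected to forward to feed_gen as qtype. A sketch of the query string the frontend would build, purely for illustration; the endpoint path and parameter names here mirror the form fields and the project's documented /api/v1/search/<engine> route, but treat both as assumptions for your deployment:

try:
    from urllib.parse import urlencode  # Python 3
except ImportError:
    from urllib import urlencode  # Python 2

# Illustrative only: parameter names mirror the radio buttons above.
params = {'query': 'fossasia', 'stype': 'news', 'format': 'json'}
print('/api/v1/search/parsijoo?' + urlencode(params))
# e.g. /api/v1/search/parsijoo?query=fossasia&stype=news&format=json (parameter order may vary)
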
test/test_baidu.py

Lines changed: 13 additions & 0 deletions
@@ -14,3 +14,16 @@ def test_parse_response():
         'link': u'mock_url'
     }]
     assert resp == expected_resp
+
+
+def test_parse_news_response():
+    html_text = """<h3 class="c-title">
+        <a href="mock_url" target="_blank">mock_title</a>
+        </h3>"""
+    dummy_soup = BeautifulSoup(html_text, 'html.parser')
+    resp = Baidu().parse_news_response(dummy_soup)
+    expected_resp = [{
+        'title': u'mock_title',
+        'link': u'mock_url'
+    }]
+    assert resp == expected_resp

test/test_mojeek.py

Lines changed: 11 additions & 0 deletions
@@ -12,3 +12,14 @@ def test_parse_response():
     }]
     resp = Mojeek().parse_response(dummy_soup)
     assert resp == expected_resp
+
+
+def test_parse_news_response():
+    html_text = '<a href="mock_url" class="ob">mock_title</a>'
+    dummy_soup = BeautifulSoup(html_text, 'html.parser')
+    expected_resp = [{
+        'title': u'mock_title',
+        'link': u'mock_url'
+    }]
+    resp = Mojeek().parse_news_response(dummy_soup)
+    assert resp == expected_resp

test/test_parsijoo.py

Lines changed: 13 additions & 0 deletions
@@ -45,3 +45,16 @@ def test_parse_image_response():
     }]
     resp = Parsijoo().parse_image_response(dummy_soup)
     assert resp == expected_resp
+
+
+def test_parse_news_response():
+    html_text = """<div class="news-title-link">
+        <a href="mock_url">mock_title</a>
+        </div>"""
+    dummy_soup = BeautifulSoup(html_text, 'html.parser')
+    expected_resp = [{
+        'title': u'mock_title',
+        'link': u'mock_url'
+    }]
+    resp = Parsijoo().parse_news_response(dummy_soup)
+    assert resp == expected_resp
