From b83c4b6c962379c995a89d68b488b4255e63b85e Mon Sep 17 00:00:00 2001
From: bhaveshAn
Date: Mon, 29 Jan 2018 23:02:54 +0530
Subject: [PATCH] Add News search in Baidu, Parsijoo & Mojeek

---
 app/scrapers/__init__.py    |  4 ++++
 app/scrapers/baidu.py       | 17 +++++++++++++++++
 app/scrapers/generalized.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++
 app/scrapers/mojeek.py      | 21 +++++++++++++++++++++
 app/scrapers/parsijoo.py    | 22 ++++++++++++++++++++++
 app/templates/index.html    | 12 +++++++-----
 test/test_baidu.py          | 13 +++++++++++++
 test/test_mojeek.py         | 11 +++++++++++
 test/test_parsijoo.py       | 13 +++++++++++++
 9 files changed, 158 insertions(+), 5 deletions(-)

diff --git a/app/scrapers/__init__.py b/app/scrapers/__init__.py
index 4fd666a0..43f37e97 100644
--- a/app/scrapers/__init__.py
+++ b/app/scrapers/__init__.py
@@ -44,6 +44,10 @@ def feed_gen(query, engine, count=10, qtype=''):
     engine = old_names.get(engine, engine)
     if engine in ('quora', 'youtube'):
         urls = scrapers[engine].search_without_count(query)
+    elif engine in ('baidu', 'parsijoo') and qtype == 'news':
+        urls = scrapers[engine].news_search(query, count, qtype)
+    elif engine == 'mojeek' and qtype == 'news':
+        urls = scrapers[engine].news_search_without_count(query)
     elif engine in ('bing', 'parsijoo') and qtype == 'vid':
         urls = scrapers[engine].video_search_without_count(query)
     elif engine in ('bing', 'parsijoo') and qtype == 'isch':
diff --git a/app/scrapers/baidu.py b/app/scrapers/baidu.py
index 82f810c2..d7fddc99 100644
--- a/app/scrapers/baidu.py
+++ b/app/scrapers/baidu.py
@@ -8,6 +8,7 @@ class Baidu(Scraper):
     def __init__(self):
         Scraper.__init__(self)
         self.url = 'https://www.baidu.com/s'
+        self.newsURL = 'http://news.baidu.com/ns'
         self.defaultStart = 0
         self.queryKey = 'wd'
         self.startKey = 'pn'
@@ -28,3 +29,19 @@ def parse_response(soup):
         print('Baidu parsed: ' + str(urls))
 
         return urls
+
+    @staticmethod
+    def parse_news_response(soup):
+        """ Parse the response and return a list of news results
+        Returns: urls (list)
+                [{'title': Title1, 'link': url1}, {'title': Title2, 'link': url2}, ...]
+        """
+        urls = []
+        for h3 in soup.findAll('h3', {'class': 'c-title'}):
+            title = h3.a.getText()
+            link = h3.a.get('href')
+            urls.append({'title': title, 'link': link})
+
+        print('Baidu parsed: ' + str(urls))
+
+        return urls
diff --git a/app/scrapers/generalized.py b/app/scrapers/generalized.py
index 204533a0..d73467d1 100644
--- a/app/scrapers/generalized.py
+++ b/app/scrapers/generalized.py
@@ -43,6 +43,21 @@ def get_page(self, query, startIndex=0, qtype=''):
                 url = self.imageURL
             else:
                 url = self.url
+        elif qtype == 'news':
+            if self.name == 'baidu':
+                url = self.newsURL
+                payload = {'word': query, self.startKey: startIndex}
+                response = requests.get(
+                    url, headers=self.headers, params=payload
+                )
+                return response
+            elif self.name == 'parsijoo':
+                url = self.newsURL
+                payload = {self.queryKey: query, 'page': startIndex}
+                response = requests.get(
+                    url, headers=self.headers, params=payload
+                )
+                return response
         payload = {self.queryKey: query, self.startKey: startIndex,
                    self.qtype: qtype}
         response = requests.get(url, headers=self.headers, params=payload)
@@ -163,3 +178,38 @@ def image_search_without_count(self, query):
         soup = BeautifulSoup(response.text, 'html.parser')
         urls = self.parse_image_response(soup)
         return urls
+
+    def news_search(self, query, num_results, qtype=''):
+        """
+        Search for the query and return a list of news results
+        Returns: list
+        """
+        urls = []
+        if self.name == 'parsijoo':
+            current_start = self.newsStart
+        else:
+            current_start = self.defaultStart
+
+        while len(urls) < num_results:
+            response = self.get_page(query, current_start, qtype)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            new_results = self.parse_news_response(soup)
+            if new_results is None:
+                break
+            urls.extend(new_results)
+            current_start = self.next_start(current_start, new_results)
+        return urls[:num_results]
+
+    def news_search_without_count(self, query):
+        """
+        Search for the query and return a list of news results
+        Returns: list
+        """
+        urls = []
+        if self.name == 'mojeek':
+            url = self.newsURL
+            payload = {self.queryKey: query, 'fmt': 'news'}
+            response = requests.get(url, headers=self.headers, params=payload)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            urls = self.parse_news_response(soup)
+        return urls
diff --git a/app/scrapers/mojeek.py b/app/scrapers/mojeek.py
index 0d447500..3c270a79 100644
--- a/app/scrapers/mojeek.py
+++ b/app/scrapers/mojeek.py
@@ -8,6 +8,7 @@ class Mojeek(Scraper):
     def __init__(self):
         Scraper.__init__(self)
         self.url = 'https://www.mojeek.co.uk/search'
+        self.newsURL = 'https://www.mojeek.co.uk/search'
         self.defaultStart = 1
         self.startKey = 's'
         self.name = 'mojeek'
@@ -27,3 +28,23 @@ def parse_response(soup):
         print('Mojeek parsed: ' + str(urls))
 
         return urls
+
+    @staticmethod
+    def parse_news_response(soup):
+        """ Parse the response and return a list of news results
+
+        Returns: urls (list)
+                [{'title': Title1, 'link': url1}, {'title': Title2, 'link': url2}, ...]
+        """
+        urls = []
+        for a in soup.findAll('a', attrs={'class': 'ob'}):
+            title = a.getText()
+            url = a.get('href')
+            urls.append({
+                'title': title,
+                'link': url
+            })
+
+        print('Mojeek parsed: ' + str(urls))
+
+        return urls
diff --git a/app/scrapers/parsijoo.py b/app/scrapers/parsijoo.py
index 69e5e40a..d4e77a1a 100644
--- a/app/scrapers/parsijoo.py
+++ b/app/scrapers/parsijoo.py
@@ -1,5 +1,9 @@
 from __future__ import print_function
 from .generalized import Scraper
+try:
+    from urllib.parse import unquote  # Python 3
+except ImportError:
+    from urllib import unquote  # Python 2
 
 
 class Parsijoo(Scraper):
@@ -10,7 +14,9 @@ def __init__(self):
         self.url = 'https://parsijoo.ir/web'
         self.imageURL = 'https://image.parsijoo.ir/image'
         self.videoURL = 'https://video.parsijoo.ir/video'
+        self.newsURL = 'http://khabar.parsijoo.ir/search/'
         self.defaultStart = 0
+        self.newsStart = 1
         self.startKey = 'co'
         self.name = 'parsijoo'
 
@@ -71,3 +77,19 @@ def parse_image_response(soup):
         print('Parsijoo parsed: ' + str(urls))
 
         return urls
+
+    @staticmethod
+    def parse_news_response(soup):
+        """ Parse the response and return a list of news results
+        Returns: urls (list)
+                [{'title': Title1, 'link': url1}, {'title': Title2, 'link': url2}, ...]
+        """
+        urls = []
+        for div in soup.findAll('div', {'class': 'news-title-link'}):
+            title = div.a.getText()
+            link = unquote(div.a.get('href'))
+            urls.append({'title': title, 'link': link})
+
+        print('Parsijoo parsed: ' + str(urls))
+
+        return urls
diff --git a/app/templates/index.html b/app/templates/index.html
index 6a81295e..4fa27184 100644
--- a/app/templates/index.html
+++ b/app/templates/index.html
@@ -98,14 +98,16 @@
 [hunk markup lost in extraction; per the diffstat it spans 7 insertions and 5 deletions around the "query-server" search form, presumably wiring the new news search type into the page]
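All three parse_news_response implementations above normalize engine-specific markup into the same list-of-dicts shape, which the tests below then check one engine at a time. A combined sketch of that common schema, using sample markup invented to match each parser's selector (import paths assumed from the repository layout):

```python
from bs4 import BeautifulSoup

from app.scrapers.baidu import Baidu
from app.scrapers.mojeek import Mojeek
from app.scrapers.parsijoo import Parsijoo

# One snippet per engine, shaped to match the selector its parser uses.
samples = [
    (Baidu(), '<h3 class="c-title"><a href="http://example.com/a">story</a></h3>'),
    (Mojeek(), '<a class="ob" href="http://example.com/b">story</a>'),
    (Parsijoo(), '<div class="news-title-link"><a href="http://example.com/c">story</a></div>'),
]

for scraper, markup in samples:
    soup = BeautifulSoup(markup, 'html.parser')
    # Every parser returns the same shape: [{'title': ..., 'link': ...}]
    print(scraper.parse_news_response(soup))
```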
diff --git a/test/test_baidu.py b/test/test_baidu.py
index 6a91ad5a..8f2e07b6 100644
--- a/test/test_baidu.py
+++ b/test/test_baidu.py
@@ -14,3 +14,16 @@ def test_parse_response():
         'link': u'mock_url'
     }]
     assert resp == expected_resp
+
+
+def test_parse_news_response():
+    html_text = """<h3 class="c-title">
+                   <a href="mock_url">mock_title</a>
+                   </h3>"""
+    dummy_soup = BeautifulSoup(html_text, 'html.parser')
+    resp = Baidu().parse_news_response(dummy_soup)
+    expected_resp = [{
+        'title': u'mock_title',
+        'link': u'mock_url'
+    }]
+    assert resp == expected_resp
diff --git a/test/test_mojeek.py b/test/test_mojeek.py
index 3c3178b2..4416ce7c 100644
--- a/test/test_mojeek.py
+++ b/test/test_mojeek.py
@@ -12,3 +12,14 @@ def test_parse_response():
     }]
     resp = Mojeek().parse_response(dummy_soup)
     assert resp == expected_resp
+
+
+def test_parse_news_response():
+    html_text = '<a class="ob" href="mock_url">mock_title</a>'
+    dummy_soup = BeautifulSoup(html_text, 'html.parser')
+    expected_resp = [{
+        'title': u'mock_title',
+        'link': u'mock_url'
+    }]
+    resp = Mojeek().parse_news_response(dummy_soup)
+    assert resp == expected_resp
diff --git a/test/test_parsijoo.py b/test/test_parsijoo.py
index 8682aedf..e9e61767 100644
--- a/test/test_parsijoo.py
+++ b/test/test_parsijoo.py
@@ -45,3 +45,16 @@ def test_parse_image_response():
     }]
     resp = Parsijoo().parse_image_response(dummy_soup)
     assert resp == expected_resp
+
+
+def test_parse_news_response():
+    html_text = """<div class="news-title-link">
+                   <a href="mock_url">mock_title</a>
+                   </div>"""
+    dummy_soup = BeautifulSoup(html_text, 'html.parser')
+    expected_resp = [{
+        'title': u'mock_title',
+        'link': u'mock_url'
+    }]
+    resp = Parsijoo().parse_news_response(dummy_soup)
+    assert resp == expected_resp