@@ -19,49 +19,25 @@ class Scraper:
19
19
}
20
20
21
21
def __init__(self):
    """Initialize the base scraper.

    ``name`` identifies the search engine; the base class uses
    "general" and engine-specific subclasses override it (e.g.
    'yahoo', 'bing', 'parsijoo').
    """
    # The original ended with a redundant `pass` after this
    # assignment; it was dead code and has been removed.
    self.name = "general"
25
def get_page(self, query, startIndex=0, qtype=''):
    """Fetch one page of search results from this engine.

    The previous docstring said "google search results page", but the
    scraper is engine-generic; the engine is selected by ``self.name``.

    Args:
        query: Search terms to submit.
        startIndex: Offset (or page number — engine dependent) of the
            first result requested.
        qtype: Result type: '' (web), 'vid' (video), 'isch' (image),
            or 'news'.

    Returns:
        The ``requests.Response`` for the results page.
    """
    # Default to the web-search endpoint; switch to a specialised
    # endpoint only for engines known to support that result type.
    url = self.url
    if qtype == 'vid' and self.name in ['yahoo', 'ask', 'parsijoo', 'bing']:
        url = self.videoURL
    elif qtype == 'isch' and self.name in ['yahoo', 'parsijoo', 'bing']:
        url = self.imageURL
    elif qtype == 'news' and self.name in ['baidu', 'parsijoo', 'mojeek', 'bing']:
        url = self.newsURL
    payload = {self.queryKey: query, self.startKey: startIndex,
               self.qtype: qtype}
    # Mojeek selects its news vertical with an extra 'fmt' query
    # parameter rather than a separate URL.
    if self.name == 'mojeek' and qtype == 'news':
        payload['fmt'] = 'news'
    response = requests.get(url, headers=self.headers, params=payload)
    # NOTE(review): looks like a debug leftover — consider replacing
    # with logging; kept to preserve observable behavior.
    print(response.url)
    return response
@@ -98,16 +74,13 @@ def search(self, query, num_results, qtype=''):
98
74
99
75
def call_appropriate_parser(self, qtype, soup):
    """Dispatch *soup* to the parser matching *qtype* for this engine.

    Falls back to the generic web-result parser whenever the engine
    has no dedicated parser for the requested result type.
    """
    engine = self.name
    if qtype == 'vid' and engine in ('yahoo', 'ask', 'parsijoo', 'bing'):
        return self.parse_video_response(soup)
    if qtype == 'isch' and engine in ('yahoo', 'parsijoo', 'bing'):
        return self.parse_image_response(soup)
    if qtype == 'news' and engine in ('parsijoo', 'mojeek', 'baidu', 'bing'):
        return self.parse_news_response(soup)
    return self.parse_response(soup)
@@ -123,95 +96,3 @@ def search_without_count(self, query):
123
96
soup = BeautifulSoup (response .text , 'html.parser' )
124
97
urls = self .parse_response (soup )
125
98
return urls
126
-
127
def video_search(self, query, num_results, qtype=''):
    """Collect up to *num_results* video-search result URLs.

    Pages through results starting at ``self.defaultStart`` until
    enough URLs are gathered or a page yields no parseable results.

    Returns: list (truncated to *num_results* entries)
    """
    collected = []
    start = self.defaultStart
    while len(collected) < num_results:
        page = self.get_page(query, start, qtype)
        soup = BeautifulSoup(page.text, 'html.parser')
        # Only yahoo/ask have a dedicated video parser; everything
        # else (and non-'vid' queries) uses the generic parser.
        wants_video = qtype == 'vid' and self.name in ['yahoo', 'ask']
        if wants_video:
            batch = self.parse_video_response(soup)
        else:
            batch = self.parse_response(soup)
        if batch is None:
            break
        collected.extend(batch)
        start = self.next_start(start, batch)
    return collected[:num_results]
146
-
147
def video_search_without_count(self, query):
    """Search videos for *query* on a single page (no paging).

    Returns:
        list of parsed video URLs, or the string
        "No video with this Keyword" when nothing was found.

    Raises:
        ValueError: if this engine has no dedicated video endpoint.
    """
    if self.name == 'parsijoo':
        url = self.videoURL
        payload = {self.queryKey: query}
    elif self.name == 'bing':
        url = self.videoURL
        payload = {self.queryKey: query, self.videoKey: 'HDRSC3'}
    else:
        # The original fell through with `url`/`payload` unbound and
        # crashed with UnboundLocalError; fail with a clear message.
        raise ValueError(
            'video search without count is not supported for ' + self.name)
    response = requests.get(url, headers=self.headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    urls = self.parse_video_response(soup)
    if len(urls) == 0:
        return "No video with this Keyword"
    else:
        return urls
166
-
167
def image_search_without_count(self, query):
    """Search images for *query* on a single page (no paging).

    Returns:
        list of parsed image URLs.

    Raises:
        ValueError: if this engine has no dedicated image endpoint.
    """
    if self.name == 'parsijoo':
        url = self.imageURL
        payload = {self.queryKey: query}
    elif self.name == 'bing':
        url = self.imageURL
        payload = {self.queryKey: query, self.imageKey: 'HDRSC2'}
    else:
        # The original fell through with `url`/`payload` unbound and
        # crashed with UnboundLocalError; fail with a clear message.
        raise ValueError(
            'image search without count is not supported for ' + self.name)
    response = requests.get(url, headers=self.headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    urls = self.parse_image_response(soup)
    return urls
183
-
184
def news_search(self, query, num_results, qtype=''):
    """Collect up to *num_results* news result URLs by paging.

    Returns: list (truncated to *num_results* entries)
    """
    # Parsijoo numbers its news pages differently from web results.
    start = self.newsStart if self.name == 'parsijoo' else self.defaultStart
    found = []
    while len(found) < num_results:
        page = self.get_page(query, start, qtype)
        soup = BeautifulSoup(page.text, 'html.parser')
        batch = self.parse_news_response(soup)
        if batch is None:
            break
        found.extend(batch)
        start = self.next_start(start, batch)
    return found[:num_results]
204
-
205
def news_search_without_count(self, query):
    """Fetch one page of news results for *query* (no paging).

    Returns:
        list of parsed news URLs.

    Raises:
        ValueError: if this engine is not supported for news search.
    """
    if self.name != 'mojeek':
        # Only Mojeek is handled here; the original fell through with
        # `url`/`payload` unbound and raised UnboundLocalError.
        raise ValueError(
            'news search without count is not supported for ' + self.name)
    url = self.newsURL
    # Mojeek selects its news vertical via the 'fmt' parameter.
    payload = {self.queryKey: query, 'fmt': 'news'}
    response = requests.get(url, headers=self.headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    urls = self.parse_news_response(soup)
    return urls
0 commit comments