import requests
import re
import wikipediaapi as wp
from bs4 import BeautifulSoup
# A script which searches the web for a particular item, goes to the Wikipedia
# page for that item and downloads the first paragraph from the wiki page.
def get_wiki_para(query):
    query += " wikipedia"
    query = query.replace(' ', '+')
    URL = f"https://google.com/search?q={query}"
    # Google returns different results for desktop and mobile queries. Specifying desktop here!
    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
    headers = {"user-agent": USER_AGENT}
    resp = requests.get(URL, headers=headers)
    if resp.status_code != 200:
        return None
    # BeautifulSoup parses the page content into a 'nice' navigable format.
    soup = BeautifulSoup(resp.content, "html.parser")
    # Out of all the web links on the first page of the Google search results,
    # follow the Wikipedia link for the item. If a company/item doesn't have a
    # wiki page, we will need a backup in place which scrapes a description
    # paragraph from some other source.
    # NOTE: Google's result markup changes over time; the 'r' class below may
    # need updating if results stop matching.
    wiki_url = None
    for g in soup.find_all('div', class_='r'):
        anchors = g.find_all('a')
        if anchors and re.search('wikipedia', anchors[0]['href']):
            wiki_url = anchors[0]['href']
            break
    if wiki_url is None:
        return None
    page = requests.get(wiki_url)
    wiki = BeautifulSoup(page.text, 'html.parser')
    # Return the first non-empty paragraph of the wiki page; the very first <p>
    # on many Wikipedia articles is an empty placeholder element.
    for p in wiki.select('p'):
        text = p.getText().strip()
        if text:
            return text
    return None

def print_categories(page):
    categories = page.categories
    for title in sorted(categories.keys()):
        print("%s: %s" % (title, categories[title]))

# A function to get a list of the major cities in the world. If we have a list
# of all the cities, countries and regions, we can filter out the 'locations'
# category.
def get_cities_list():
    URL = 'https://en.wikipedia.org/wiki/List_of_largest_cities'
    res = requests.get(URL).text
    soup = BeautifulSoup(res, 'lxml')
    city_table = soup.find('table', {'class': 'sortable wikitable mw-datatable'})
    city_names = []
    # Skip the header row; the first link in each data row is the city name.
    for row in city_table.find_all('tr')[1:]:
        city_cell = row.find_all('a')[0]
        city_names.append(city_cell.text)
    print(city_names)
    return city_names
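
# A minimal usage sketch, assuming the query below is illustrative. The Google
# scraping step depends on markup that changes over time, so get_wiki_para may
# return None.
if __name__ == "__main__":
    para = get_wiki_para("python programming language")
    if para:
        print(para)
    get_cities_list()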