# Week 4 - Intermediate Importing Data in Python
# Importing flat files from the web -
from urllib.request import urlretrieve
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
print(url)
urlretrieve(url, 'Files/winequality-white.csv')
# Example2 - Import Red Wine Data from Web
import pandas as pd
url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/datasets/winequality-red.csv'
# save file locally -
urlretrieve(url,'winequality-red.csv')
# Read file into a DataFrame and print its head
df = pd.read_csv('winequality-red.csv',sep=';')
print(df.head())
import matplotlib.pyplot as plt
# Opening and reading flat files directly from the web -
url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/datasets/winequality-red.csv'
# read file into Pandas dataframe
df = pd.read_csv(url,sep =';')
print(df.head())
# plot a histogram of the first column of df -
df['fixed acidity'].hist()
plt.xlabel('fixed acidity (g(tartaric acid)/dm$^3$)')
plt.ylabel('count')
plt.show()
# Importing non-flat files from the web -
import xlrd  # the engine pandas uses to read legacy .xls files
url = 'http://s3.amazonaws.com/assets.datacamp.com/course/importing_data_into_r/latitude.xls'
# Read in all sheets of the Excel file: sheet_name=None imports every sheet into a dict of DataFrames
xls = pd.read_excel(url,sheet_name = None)
# Print the sheet names to the shell
print(xls.keys())
# Print the head of the first sheet (using its name, NOT its index)
print(xls['1700'].head())
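# Because sheet_name=None returns a dict of DataFrames keyed by sheet name,
# you can iterate over every sheet. A minimal sketch:
for sheet_name, sheet_df in xls.items():
    print(sheet_name, sheet_df.shape)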
### HTTP requests to import files from the web -
# GET requests using urllib -
# To extract the HTML from the Wikipedia homepage:
from urllib.request import urlopen,Request
# specify the url
url = "https://www.wikipedia.org/"
# package the GET request using the function Request
request = Request(url)
# send the request & catch the response using urlopen
response = urlopen(request)
# read the response with the read() method; it returns the page's HTML as bytes
html = response.read()
# close the response
response.close()
print(html)
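# urlopen() also works as a context manager, which closes the response for you;
# decode() turns the returned bytes into a str. A minimal sketch of the same
# request:
with urlopen(Request(url)) as resp:
    html_str = resp.read().decode('utf-8')
print(html_str[:200])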
# GET requests using Requests package -
import requests
url = "https://www.wikipedia.org/"
r = requests.get(url)
text = r.text
print(text)
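# requests can also build the query string for you via the params argument.
# A minimal sketch (https://httpbin.org/get is an echo service used purely
# for illustration):
payload = {'q': 'python', 'page': 1}
r = requests.get('https://httpbin.org/get', params=payload)
print(r.url)  # the encoded query string is appended automatically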
# Import packages
from urllib.request import urlopen,Request
# Specify the url
url = "https://campus.datacamp.com/courses/1606/4135?ex=2"
# This packages the request: request
request = Request(url)
# Sends the request and catches the response: response
response = urlopen(request)
# Print the datatype of response
print(type(response))
# Be polite and close the response!
response.close()
# Scraping the web in Python - using BeautifulSoup
from bs4 import BeautifulSoup
# Example 1 -
url = 'https://www.crummy.com/software/BeautifulSoup/'
r = requests.get(url)
html_doc = r.text
# parse the HTML into a BeautifulSoup object, then print it prettified
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())
# Example 2 -
# Specify url: url
url = 'https://www.python.org/~guido/'
# Package the request, send the request and catch the response: r
r = requests.get(url)
# Extracts the response as html: html_doc
html_doc = r.text
# Create a BeautifulSoup object from the HTML: soup
soup = BeautifulSoup(html_doc, 'html.parser')
# Prettify the BeautifulSoup object: pretty_soup
pretty_soup = soup.prettify()
# Print the response
print(pretty_soup)
# BeautifulSoup: getting the title of HTML page -
guido_title = soup.title
print(guido_title)
# getting the text of the HTML page
guido_text = soup.get_text()
print(guido_text)
# BeautifulSoup: getting the hyperlinks -
# Find all 'a' tags (which define hyperlinks): a_tags
a_tags = soup.find_all('a')
# Print the URLs to the shell
for link in a_tags:
    print(link.get('href'))
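# Many href values are relative; urljoin() from the standard library resolves
# them against the page URL. A minimal sketch:
from urllib.parse import urljoin
for link in a_tags:
    href = link.get('href')
    if href:  # skip <a> tags without an href attribute
        print(urljoin(url, href))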
# Introduction to APIs and JSONs -
import json
with open('snake.json','r') as json_file:
    json_data = json.load(json_file)
print(type(json_data))
# Example 2 -
with open(r'D:\Learning\UCD Professional Academy\Course 1 - Professional Certificate in Data Analytics\Python Practice\world_countries.json') as file:
    data = json.load(file)
print(type(data))
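# json.dumps() pretty-prints a loaded JSON object, which helps when exploring
# an unfamiliar structure. A minimal sketch (truncated to 500 characters):
print(json.dumps(data, indent=2)[:500])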
## APIs and interacting with the world wide web -
# OMDB Movie API
import requests
url = 'http://www.omdbapi.com/?apikey=72bc447a&t=the+social+network'
# Package the request, send the request and catch the response:
r = requests.get(url)
# Decode the JSON data into a dictionary: json_data
json_data = r.json()
for key, value in json_data.items():
    print(key + ':', value)
print(r.text)
# Checking out the Wikipedia API - (Wikipedia page for Pizza)
import requests
url = 'https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=pizza'
# Package the request, send the request and catch the response:
r = requests.get(url)
# Decode the JSON data into a dictionary: json_data
json_data = r.json()
# Print the Wikipedia page extract
pizza_extract = json_data['query']['pages']['24768']['extract']
print(pizza_extract)
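# The page id ('24768' above) differs per article, so it is safer not to
# hard-code it. A minimal sketch that walks whatever pages the response holds:
for page_id, page in json_data['query']['pages'].items():
    print(page_id, page['extract'][:200])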
# The Twitter API and Authentication -
# Example -
# Import package
import tweepy
# Store OAuth authentication credentials in relevant variables
access_token = "1092294848-aHN7DcRP9B4VMTQIhwqOYiB14YkW92fFO8k8EPy"
access_token_secret = "X4dHmhPfaksHcQ7SCbmZa2oYBBVSD2g8uIHXsp5CTaksx"
consumer_key = "nZ6EA0FxZ293SxGNg8g8aP0HM"
consumer_secret = "fJGEodwe3KiKUnsYJC3VRndj7jevVvXbK2D5EiJ2nehafRgA6i"
# Pass OAuth details to tweepy's OAuth handler
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
## Streaming tweets -
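# MyStreamListener (used below) is defined in the DataCamp exercise, not in
# this script. A minimal sketch of such a listener, assuming tweepy v3.x
# (v4 removed tweepy.StreamListener):
import json

class MyStreamListener(tweepy.StreamListener):
    """Write the first 100 streamed tweets to tweets.txt as JSON lines."""
    def __init__(self, api=None):
        super().__init__(api)
        self.num_tweets = 0
        self.file = open('tweets.txt', 'w')

    def on_status(self, status):
        self.file.write(json.dumps(status._json) + '\n')
        self.num_tweets += 1
        if self.num_tweets >= 100:  # stop the stream after 100 tweets
            self.file.close()
            return False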
# Initialize Stream listener
l = MyStreamListener()
# Create your Stream object with authentication
stream = tweepy.Stream(auth, l)
# Filter Twitter Streams to capture data by the keywords:
stream.filter(track=['clinton', 'trump', 'sanders', 'cruz'])
## Load & explore your Twitter data -
# Import package
import json
# Define file path
tweets_data_path = 'tweets.txt'
# Initialize empty list to store tweets:
tweets_data = []
# Open connection to file
tweets_file = open(tweets_data_path,'r')
# Read in tweets and store in list
for line in tweets_file:
    tweet = json.loads(line)
    tweets_data.append(tweet)
# Close connection to file
tweets_file.close()
# Print the keys of the first tweet dict
print(tweets_data[0].keys())
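# A quick way to explore the tweets is to pull the fields of interest into a
# DataFrame. A minimal sketch (assumes each tweet dict carries 'text' and
# 'lang' keys, as standard tweet JSON does):
import pandas as pd
df = pd.DataFrame(tweets_data, columns=['text', 'lang'])
print(df.head())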
# Importing Data in Python - Practice
# Example 1 -
from bs4 import BeautifulSoup
import requests
url = 'https://www.python.org/~guido'
r = requests.get(url)
d = r.text
s = BeautifulSoup(d, 'html.parser')
print(type(r))
# Example 2 -
import pandas as pd
from urllib.request import urlretrieve
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
urlretrieve(url,'winequality-wh.csv')
data = pd.read_csv('winequality-wh.csv',sep=';')
print(data[['alcohol','quality']].head())
# Example 3 -
import requests
url = ('http://www.omdbapi.com/?apikey=72bc447a&t=the+social+network')
r = requests.get(url)
json_data = r.json()
print(json_data['Awards'])
# Example 4 -
url = 'https://www.python.org/~guido/'
r = requests.get(url)
html_doc = r.text
s = BeautifulSoup(html_doc, 'html.parser')
print(s.title)
## Connect to SQL DB Practice -
# Example - 1
import pymysql
# database connection -
connection = pymysql.connect(host='localhost',user='root',passwd="",database="hospital_info")
cursor = connection.cursor()
# retrieve data from table -
retrieve = "SELECT * FROM doctor;"
# executing the query
cursor.execute(retrieve)
rows = cursor.fetchall()
for row in rows:
    print(row)
# commit the connection -
connection.commit()
# close the connection -
connection.close()
# Example - 2
connection = pymysql.connect(host='localhost',user='root',passwd="",database="hospital_info")
cursor = connection.cursor()
# getting all tables present in hospital_info DB -
cursor.execute("SHOW TABLES;")
# print all tables in DB -
names_table = cursor.fetchall()
for table in names_table:
    print(table)
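# Parameterized queries let the driver escape values for you; pymysql uses %s
# placeholders. A minimal sketch (the Speciality column and the filter value
# are illustrative):
cursor.execute("SELECT * FROM doctor WHERE Speciality = %s;", ('Cardiology',))
for row in cursor.fetchall():
    print(row)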
# Example - 3
import pandas as pd
connection = pymysql.connect(host='localhost',user='root',passwd="",database="hospital_info")
cursor = connection.cursor()
# getting all tables present in hospital_info DB -
cursor.execute("SELECT * FROM doctor;")
df = pd.DataFrame(cursor.fetchall())
# name the columns and set index column -
df.columns = ['Doctor_Id','Doctor_Name','Hospital_Id','Joining_Date','Speciality','Salary','Experience']
df = df.set_index(['Doctor_Id'])
print(df)
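# pandas can also run the query and build the DataFrame in one step with
# read_sql() (it may warn that it prefers SQLAlchemy connections, but DBAPI
# connections such as pymysql's work too). A minimal sketch:
df2 = pd.read_sql("SELECT * FROM doctor;", connection)
print(df2.head())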
## Scraping the web in Python Practice -
# Scraping the Monster Job Site -
import requests
URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
# parse HTML code with BeautifulSoup -
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content,'html.parser')
# print the title of web page -
print(soup.title)
# print the text on the page -
print(soup.get_text())
# iterate through all of the hyperlinks on the page and print their URLs -
for link in soup.find_all('a'):
    print(link.get('href'))
# finding elements by ID & prettifying it -
results = soup.find(id='ResultsContainer')
print(results.prettify())
# find Elements by HTML Class Name - select only the job postings:
job_elems = results.find_all('section', class_='card-content')
print(job_elems)
for job_elem in job_elems:
    # Each job_elem is a new BeautifulSoup object.
    # You can use the same methods on it as you did before.
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    print(title_elem)
    print(company_elem)
    print(location_elem)
    print()
# Extract Text From HTML Elements - using .text
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if None in (title_elem, company_elem, location_elem):
        continue  # disregard the problematic element and skip over it
    print(title_elem.text)
    print(company_elem.text)
    print(location_elem.text)
    print()
# Find Elements by Class Name and Text Content
# job titles in the page are kept within <h2> elements
python_jobs = results.find_all('h2', string='Python Developer')
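# string= matches the exact text only, so 'Python Developer' misses titles
# such as 'Senior Python Developer'. A minimal sketch that passes a function
# for a case-insensitive substring match instead:
python_jobs = results.find_all(
    'h2', string=lambda text: text and 'python' in text.lower())
for h2 in python_jobs:
    print(h2.text.strip())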
## Working with APIs in Python -
# Implementation of GET request
import requests
response = requests.get('https://google.com/')
print(response)
# view the status code of the response
print(response.status_code)
# Using Response instance in a conditional expression
if response:
    print('Response is successful')
else:
    print('Request returned an error')
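# requests can also raise an exception on 4xx/5xx responses instead of
# relying on truthiness. A minimal sketch:
from requests.exceptions import HTTPError
try:
    response.raise_for_status()  # no-op for 2xx responses
except HTTPError as err:
    print('Request failed:', err)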
# Example 1 - the Open Notify API, which serves NASA's space data (astronauts, ISS location)
import requests
request = requests.get('http://api.open-notify.org')
print(request.text)
print(request.status_code)
# hit a fake endpoint that does not exist
request2 = requests.get('http://api.open-notify.org/fake-endpoint')
print(request2.status_code)
# number of astronauts in space
people = requests.get('http://api.open-notify.org/astros.json')
print(people.text)
# make the output easier to work with via the .json() method
people_json = people.json()
print(people_json)
# print the number of people in space
print('Number of people in space:',people_json['number'])
# print the names of the people in space (using a for loop)
for p in people_json['people']:
    print(p['name'])
# International Space Station Current Location
import requests
data = requests.get("http://api.open-notify.org/iss-now.json")
print(data.status_code)
print(data.text)
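# The coordinates sit inside the 'iss_position' key of the JSON payload.
# A minimal sketch that pulls them out:
position = data.json()['iss_position']
print('Latitude:', position['latitude'])
print('Longitude:', position['longitude'])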