#######################################################################
#######################################################################
################### Code developed by: ########################
################### Unai Saralegui ########################
#######################################################################
#######################################################################
######### Scraping data from timeanddate.com #########
#### In this script historic data about the weather is scraped ####
#### from timeanddate.com. The data scraped are the ####
#### "Date","Temperature (C)","Weather","Wind (km/h)", ####
#### "Humidity (%)","Barometer (mbar)","Visibility (km)" values ####
#### for the years 2015 and 2016 in Berkeley, USA. ####
#######################################################################
#######################################################################
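# Usage sketch (an assumption, not part of the original script): this script relies on
# the webkit-server package, which drives a headless WebKit browser so the page's
# javascript can be executed. Running `python weatherScraping.py` should leave the
# scraped results in scrapedData.csv in the working directory.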
from bs4 import BeautifulSoup
import webkit_server
import pandas as pd
import re
def getDate(kk):
    """Split a string containing both hour and date into its two parts."""
    aa = re.split(pattern='[A-Za-z]+', string=kk, maxsplit=1)  # split at the first run of letters
    aa[1] = aa[1].split(maxsplit=1)[1]  # drop the leading comma, keep 'day month'
    return aa
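# Illustrative call (the input format is inferred from the th parsing further below):
# getDate('12:00 amSat, 21 Aug') -> ['12:00 ', '21 Aug']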
def html2DF(soup):
    """This function receives a BeautifulSoup object and returns a pandas data frame
    with a table containing dates and different values obtained by web scraping"""
    # the values in the table are defined by td tags
    temp = soup.find_all('td')
    kk = []  # a list to save the text of each td
    for i in temp:
        kk.append(i.text)  # we just save the text
    kk.pop()  # we don't need the last value, and it can be problematic to handle, so we erase it
    kk.append('')  # add an empty string so the loop below closes the last row
    byrow = list()  # a new list for saving only the desired values
    ini = list()  # initialize a list that will be appended to byrow
    for i in range(len(kk)):
        if kk[i] == '':
            # when we find an empty string, append the current ini and start a new one
            byrow.append(ini)
            ini = list()
        else:
            # append the current value to ini
            ini.append(kk[i])
    # the first item in byrow is from a table we don't need, so we remove it
    byrow = byrow[1:]
    # we now remove items that do not fit the required size
    torem = list()
    for i in range(len(byrow)):
        if len(byrow[i]) != 7:  # each valid table row has 7 elements
            torem.append(i)
    for i in reversed(torem):
        # delete from the end so the remaining indices stay valid
        del byrow[i]
    # the values in the rows are not numeric, and some carry characters we do not need,
    # so we process the strings to get the desired result
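    # Illustrative raw row before cleaning (values and units are assumptions, not taken
    # from a real response; the fourth field is ignored by the code below):
    # ['16\xa0°C', 'Sunny.', '11 km/h', '…', '72%', '1016 mbar', '16\xa0km']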
    for i in range(len(byrow)):
        test = byrow[i]
        temper = int(test[0].split('\xa0°C')[0])
        weather = test[1]
        if test[2] == "No wind":
            wind = 0
        elif test[2] == "N/A":
            wind = "NA"
        else:
            wind = int(test[2].split()[0])
        if test[4] == "N/A":
            hum = "NA"
        else:
            hum = int(test[4].split('%')[0])
        barom = int(test[5].split()[0])
        visib = int(test[6].split('\xa0km')[0])
        byrow[i] = [temper, weather, wind, hum, barom, visib]
    # we define the header for the data frame by hand
    header = ["Date", "Temperature (C)", "Weather", "Wind (km/h)", "Humidity (%)", "Barometer (mbar)", "Visibility (km)"]
    # we now find the time of each of the measurements, which are in th tags
    temp = soup.find_all('th')
    # the first 21 elements always belong to other values, so we do not consider them
    temp = temp[21:]
    # we create an empty list for saving the hours
    times = list()
    for i in temp:
        times.append(i.text)
    # the first element is a bit different, as it contains both the hour and the date,
    # so we call getDate to separate the time from the date
    date = getDate(times[0])
    times[0] = date[0]  # hour
    date = date[1]  # day
    # the day will be something like '21 Aug', or '21 de ago' if the page is in Spanish,
    # so we split it to get the values
    date = date.split()
    if len(date) == 3:  # the result differs depending on whether the page opened in English or Spanish:
        # in English it will be something like ['21', 'Aug'] and in Spanish ['21', 'de', 'ago'],
        # so if it is in Spanish we need to remove 'de'
        del date[1]
    # map month names to numbers, in both English and Spanish
    months = {'ene': 1, 'feb': 2, 'mar': 3, 'abr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'ago': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dic': 12,
              'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
    date[1] = str(months[date[1]])  # turn the month name into a number
    # we now retrieve the year of the data, which can be found in an h2 tag
    temp = soup.find_all('h2')
    yy = temp[0].text.split()[1]  # obtain the year
    date.append(yy)  # now date is [day, month, year]
    # reorder date to [year, month, day] so join yields YYYY-MM-DD
    date[0], date[2] = date[2], date[0]
    fec = "-".join(date)  # YYYY-MM-DD
    # we now merge the date with the hours so each timestamp reads 'YYYY-MM-DD' plus the time of day
    for i in range(len(times)):
        times[i] = fec + " " + times[i]
    # all the data has been obtained; now we build the pandas data frame that holds it
    df = pd.DataFrame(byrow, columns=header[1:])
    df.insert(0, 'Date', times)
    return df
a = webkit_server.Client()  # headless WebKit session, used to run the page's javascript
base_url = "https://www.timeanddate.com/weather/usa/berkeley/historic?"
dfall = pd.DataFrame(columns=["Date", "Temperature (C)", "Weather", "Wind (km/h)", "Humidity (%)",
                              "Barometer (mbar)", "Visibility (km)"])
try:
    for year in [2015, 2016]:  # years 2015 and 2016
        print(year)
        for month in range(1, 13):  # all the months in the year
            print(month)
            url = [base_url, "month=", str(month), "&year=", str(year)]
            URL = "".join(url)  # this URL is valid for this month and year
            a.visit(URL)
            # once the month page is opened we can make the requests
            response = a.body()
            soup = BeautifulSoup(response, "html.parser")
            # we first obtain the javascript commands we have to run
            temp = soup.find_all('a', rel="nofollow")  # commands to run to obtain the data for the different days
            for i in temp:
                a.eval_script(str(i).split("return ")[1].split("\"")[0])  # execute the command
                # once the command is executed a False value will be returned and the new data will be loaded into the page
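                # Illustratively (the exact attribute format is site-specific and not
                # verified here), an anchor like <a rel="nofollow" onclick="...; return
                # showDay(21)" ...> yields the command 'showDay(21)' after the two splits;
                # 'showDay' is a hypothetical name for the page's handler.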
                response = a.body()  # get the new body of the html code
                soup = BeautifulSoup(response, "html.parser")  # initialize the BeautifulSoup object
                df1 = html2DF(soup)  # obtain the data frame for the year and month
                dfall = pd.concat([dfall, df1])  # concat the new values with the previous ones
    # once the requests have ended, save the resulting data frame to a csv file
    print("Data obtained, saving...")
    dfall.to_csv("scrapedData.csv", index=False)
except Exception:
    # if anything fails, save whatever data has been obtained so far
    print("An error happened, saving the obtained data...")
    dfall.to_csv("scrapedData.csv", index=False)
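# The resulting scrapedData.csv should contain rows shaped like the following
# (values are illustrative, not real scraped data):
#   Date,Temperature (C),Weather,Wind (km/h),Humidity (%),Barometer (mbar),Visibility (km)
#   2015-8-21 12:00 am,16,Sunny.,11,72,1016,16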