-
Notifications
You must be signed in to change notification settings - Fork 0
/
DownloadAndUnzip-Python2.py
300 lines (242 loc) · 12.6 KB
/
DownloadAndUnzip-Python2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
# coding: utf-8
# In[1]:
########################################################
# Step 1: Download
#######################################################
#import some standard modules we can use to download URLs
import urllib2
# URL of the file to download, from the website of the National Stock
# Exchange of India, which publishes each day a bunch of interesting
# market data.
urlOfFileName = "http://www.nseindia.com/content/historical/EQUITIES/2015/JUL/cm17JUL2015bhav.csv.zip"
# In[4]:
# Local file in which to store the downloaded zip
# (this is a path on the original author's desktop).
localZipFilePath = "/Users/swethakolalapudi/Desktop/cm17JUL2015bhav2.csv.zip"
# Request headers that make the script look like a regular web browser.
# According to the original author's notes, the NSE website blocks automated
# scripts unless the User-Agent header is set.
# The Accept and Accept-Charset values use ';' before each q-value and ','
# between alternatives, which is the syntax HTTP expects.
hdr = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}
# In[5]:
# Build the web request, attaching the browser-like headers from above.
webRequest = urllib2.Request(urlOfFileName, headers=hdr)
# Network access is error-prone, so use try/except as a safety net.
try:
    page = urllib2.urlopen(webRequest)
    try:
        # 'content' holds the raw bytes of the zip file served at the URL.
        content = page.read()
    finally:
        # Always release the connection, even if reading fails half-way.
        page.close()
except urllib2.HTTPError as e:
    # The server answered, but with an error status. Show the response body
    # (when there is one) to help diagnose the problem.
    if e.fp is not None:
        print(e.fp.read())
    print(" ".join(["Looks like the file download did not go through, for url = ", urlOfFileName]))
except urllib2.URLError as e:
    # The server could not be reached at all (DNS failure, refused
    # connection, ...). HTTPError is a subclass of URLError, so this clause
    # must come after the HTTPError one.
    print("Could not reach the server: " + str(e.reason))
    print(" ".join(["Looks like the file download did not go through, for url = ", urlOfFileName]))
else:
    # Only reached when the download succeeded. The 'w' means we intend to
    # write and the 'b' that this is a binary (not a text) file. The 'with'
    # block closes the file even if the write raises.
    with open(localZipFilePath, "wb") as output:
        output.write(content)
# In[7]:
####################################################################
# Step 2: Unzip the downloaded file and extract
####################################################################
import zipfile, os
# Local directory into which the downloaded zip file is extracted.
localExtractFilePath = "/Users/swethakolalapudi/Desktop/"
# A zip file can contain any number of files and we don't know how many
# upfront, so collect the paths of the extracted files in a list.
# It is created before the existence check so that later steps always find
# the name defined, even when nothing could be extracted.
listOfFiles = []
# Check that the zip file was downloaded successfully; if not, there is
# little point in trying to extract!
if os.path.exists(localZipFilePath):
    print("Cool!" + localZipFilePath + " exists..proceeding")
    # zipfile knows how to read the (binary) archive. The 'with' block makes
    # sure the archive is closed again, even if an extraction fails.
    with zipfile.ZipFile(localZipFilePath, 'r') as zipFileHandler:
        # Iterate over the names of the zipped-up files.
        for filename in zipFileHandler.namelist():
            # extract() returns the path it actually wrote to, so record
            # that instead of gluing directory and name together by hand.
            extractedPath = zipFileHandler.extract(filename, localExtractFilePath)
            listOfFiles.append(extractedPath)
            print("Extracted " + filename + " from the zip file to " + extractedPath)
    # We are outside the for-loop (and the archive is closed) now.
    print(" ".join(["In total, we extracted ", str(len(listOfFiles)), " files"]))
else:
    print("Could not find " + localZipFilePath + " - nothing to extract")
# In[26]:
###################################################################
# Step 3: Parse the extracted CSV and create a table of stock info
###################################################################
import csv
# import the CSV module, which knows how to do stuff with CSV files
# (Aside: CSV = Comma-Separated-Values, a file format in which each
# line is a list of values separated by commas). We need a handler
# because there are edge cases (for instance cell values themselves
# containing commas) that the module knows how to handle.
# The original notes say the downloaded zip has only one file that we care
# about, so take the first extracted path. Remember that lists are indexed
# from zero.
oneFileName = listOfFiles[0]
# Number of the line currently being processed (1 = header row).
lineNum = 0
# A list of lists in which to store the output. For each stock we keep
# three columns: the ticker, how much the stock moved today (% change versus
# yesterday's close) and how heavily it was traded (the value that changed
# hands today).
listOfLists = []
# 'with' opens the file and closes it again when the block ends, so there is
# no explicit close. 'rb' = read-only, binary mode, which is what the
# Python 2 csv module expects.
with open(oneFileName, 'rb') as csvfile:
    # The csv reader yields one line at a time as a list of cell values.
    # It needs to know (a) the delimiter that separates cells and (b) the
    # quote character used around cells that themselves contain a comma.
    lineReader = csv.reader(csvfile, delimiter=",", quotechar="\"")
    for lineNum, row in enumerate(lineReader, 1):
        if lineNum == 1:
            print("Skipping the header row")
            continue
        # Blank or truncated lines do not have the columns we need; skip
        # them instead of failing with an IndexError.
        if len(row) < 10:
            continue
        # From the header (per the original notes):
        #   column 1  (index = 0) is the stock ticker
        #   column 6  (index = 5) is the last closing price
        #   column 8  (index = 7) has yesterday's last closing price
        #   column 10 (index = 9) has the traded value in Indian Rupees
        symbol = row[0]
        close = row[5]
        prevClose = row[7]
        # NOTE: despite its name, this holds the traded *value*, not a
        # quantity.
        tradedQty = row[9]
        previousClosePrice = float(prevClose)
        # A previous close of zero would make the % change undefined
        # (division by zero), so leave such rows out.
        if previousClosePrice == 0:
            continue
        pctChange = float(close) / previousClosePrice - 1
        tradedValue = float(tradedQty)
        # oneResultRow will look like this: ['SBI', 0.034, 100000.0]
        oneResultRow = [symbol, pctChange, tradedValue]
        # listOfLists will look like this:
        # [ ['SBI', 0.034, 100000.0], ['AXISBANK', 0.056, 800900.0] ]
        listOfLists.append(oneResultRow)
        # "{:,.1f}".format prints comma-separated numbers with one decimal.
        print(" ".join([symbol, "{:,.1f}".format(tradedValue / 1e6) + "M INR", "{:,.1f}".format(pctChange * 100) + "%"]))
# We have left the with block, so the file has been closed automatically.
print("Done iterating over the file contents - the file is closed now!")
print("We have stock info for " + str(len(listOfLists)) + " stocks")
# In[27]:
####################################################################
# Step 4: Sort the list of lists!
####################################################################
# We have spent a lot of time talking about the power of for loops,
# but Python also lets us hand a small function to built-ins such as
# sorted(), which is often the more 'idiomatic' Python way of doing stuff.
# A lambda function is simply a short, unnamed function written inline.
# For now, let's just use a lambda function to sort our list of lists.
# We won't go into a whole lot of detail about what lambda functions are,
# beyond saying that passing one as the 'key' argument is a way of saying
# 'Dear sorted(), here is a function, please apply this function to each
# element of the list and order the elements by the results'.
# In general, idiomatic Python often replaces explicit for loops with
# built-ins that take such functions. But our
# objective in this class is to understand the basics of programming
# via Python, so we will continue with our way of doing stuff using
# for loops.
# After that long segue, back to the task at hand of sorting a list
# of lists.
# Sort the rows by column 3 (index = 2), the value traded. reverse=True
# makes the sort descending, so the most heavily traded stocks come first.
# This is the list the Excel summary ("Top Traded Stocks") is built from.
listOfListsSortedByQty = sorted(listOfLists, key=lambda x: x[2], reverse=True)
# Sort the same rows by column 2 (index = 1), the % change, again
# descending, so the biggest gainers come first. It gets its own name so it
# does not overwrite the traded-value ranking above.
listOfListsSortedByPctChange = sorted(listOfLists, key=lambda x: x[1], reverse=True)
# In[29]:
########################################################################
# Step 5: Write out an excel file with the summary of the top movers,
# and most heavily traded stocks
########################################################################
import xlsxwriter
# import the xlsxwriter library which does all the magic
# Name of the excel file we will create.
excelFileName = "/Users/swethakolalapudi/cm17JUL2015bhav.xlsx"
# Create a workbook. This is analogous to opening the barrel.
workbook = xlsxwriter.Workbook(excelFileName)
# Create an (empty) worksheet named 'Summary'.
worksheet = workbook.add_worksheet("Summary")
# The way to write stuff into the excel file is by specifying
#   a. the cell address, e.g. "A1", in the standard excel format
#   b. the list of values to be written, 1 per cell starting from that address
worksheet.write_row("A1", ["Top Traded Stocks"])
worksheet.write_row("A2", ["Stock", "% Change", "Value Traded (INR)"])
# Write out the 5 most heavily traded stocks; the list is expected to be
# sorted by traded value already. Slicing with [:5] takes at most five rows,
# so a file with fewer than five stocks does not raise an IndexError.
for rowNum, oneRowToWrite in enumerate(listOfListsSortedByQty[:5]):
    # rowNum starts at 0, while excel rows are numbered from 1 and rows 1
    # and 2 already hold the title and the column headers - hence the + 3.
    worksheet.write_row("A" + str(rowNum + 3), oneRowToWrite)
# We are done with the for loop. Close the file (ie seal the barrel!)
workbook.close()
# In[ ]: