-
Notifications
You must be signed in to change notification settings - Fork 8
/
hkex.py
188 lines (168 loc) · 6.06 KB
/
hkex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# Project hkex.py
# support file: myfunctions.py
# by: Albert Lam <albert@lamfamily.hk>
#
import os
import datetime
import myfunctions as mf
import movingAverage as ma
import linearLeastSquare as lsq
# MAIN Function starts
# Setting up storage environments:
# hdir : for downloaded html and converted text files
# ddir : for storing the extracted data
# adir : for results from processing the extracted data
wdir = os.getcwd()
hdir = wdir + '/HTML/'
ddir = wdir + '/Data/'
adir = wdir + '/Analysis/'
# Create the download and data directories up front (makedirs with
# exist_ok is race-free, unlike the exists()/mkdir() pair).  adir is
# deliberately created later, just before the analysis step.
os.makedirs(hdir, exist_ok=True)
os.makedirs(ddir, exist_ok=True)
########## DOWNLOAD WEBPAGE
#
# DOWNLOADING DAILY QUOTATION WEBPAGES FROM HONG KONG STOCK EXCHANGE
# and strip all inline html tags to create a text version of the file
#
# Setting up environment:
# today's date in yymmdd format — the upper bound for the download loop
now = datetime.datetime.now()
lst = now.strftime('%y%m%d')
# Starting date for downloading the quotation webpages.
# Default is '190430'; it is then updated in the following order:
# 1. look for the last downloaded file date in env.txt
# 2. otherwise inspect the html files already downloaded for the latest date
fst = '190430'
os.chdir(hdir)
if os.path.isfile('env.txt'):
    with open('env.txt', 'r') as f:
        for line in f:
            # line looks like 'lastdate:yymmdd' — slice out the date part
            if line.startswith('lastdate'):
                fst = line[9:15]
                break
else:
    # list of downloaded html file names without extension (yymmdd dates)
    datelist = mf.extlist(hdir, '.html', 'N')
    # Pick the latest date.  The original tested len(datelist) > 1, which
    # silently ignored the case of exactly one downloaded file and forced a
    # full re-download from the default date — fixed to any non-empty list.
    if datelist:
        datelist.sort()
        fst = datelist[-1]
lastdate = fst
# Download the web pages for the next and subsequent dates from HKEX, if
# they exist.  Pages too small to contain useful data are deleted, and
# lastdate is updated after each successful download.
while fst < lst:
    # advance fst to the next day and build the URL and local filename
    fst = mf.nextday(fst)
    url = "https://www.hkex.com.hk/eng/stat/smstat/dayquot/d"+fst+"e.htm"
    if mf.url_is_alive(url):
        fname = fst + '.html'
        print('Downloading : ', fname)
        mf.dnload(url, fname)
        # a page under 1 KB holds no quotation data — discard it
        if os.stat(fname).st_size < 1024:
            os.remove(fname)
        else:
            lastdate = fst
# record the last date whose webpage was downloaded in env.txt
# (with-statement guarantees the handle is closed even on error)
with open('env.txt', 'w') as f:
    f.write('lastdate:'+lastdate+'\n')
######### EXTRACT DATA FROM HTML FILES
#
# EXTRACT THE SUMMARY OF THE DAILY ACTIVITY FROM EACH HTML FILE
# and save the information in "quotations.csv"
#
os.chdir(ddir)
# create a company database if it doesn't exist
if not os.path.isfile('company.csv'): mf.crcof()
# create the quotations.csv file (with its header row) if it doesn't exist
if not os.path.isfile("quotations.csv"):
    with open('quotations.csv', 'w') as fout:
        fout.write('code,date,tdn,high,low,close,ask,bid,turnover,volume\n')
# READ WHAT HAS BEEN DONE PREVIOUSLY TO AVOID REPETITION
# retrieve a sorted list of all html files (with extension) in the HTML dir
htmlfiles = mf.extlist(hdir, '.html', 'Y')
htmlfiles.sort()
# list of company codes already present in company.csv
comlist = []
with open('company.csv', 'r') as comfile:
    for line in comfile:
        if line.startswith('code'): continue
        comlist.append(line.split(',')[0])
# dictionary of trading dates whose results were extracted on a previous
# run, mapping date -> annual session number
dlist = dict()
if os.path.isfile('sessions.csv'):
    with open('sessions.csv', 'r') as sfile:
        for line in sfile:
            if line.startswith('date'): continue
            sn = line.split(',')
            dlist[sn[0]] = sn[2].strip()
    os.remove('sessions.csv')
# rewrite the sessions file for the current run
with open('sessions.csv', 'w') as sfile:
    sfile.write('date,idx,tdnum\n')
    # Loop through all the downloaded webpages.  enumerate() yields each
    # file's position directly — the original called htmlfiles.index(file)
    # once per file, an accidental O(n^2) scan over the whole list.
    for idx, file in enumerate(htmlfiles):
        # the leading 6 characters of the filename are the yymmdd date
        date = file[:6]
        if date in dlist:
            # already processed on a previous run — just re-record it
            sfile.write(date+','+str(idx)+','+dlist[date]+'\n')
        else:
            # extract the data and append it to "quotations.csv"
            print('Processing : ', file)
            tdn = mf.read_h(hdir+file, comlist)
            # record that the file has been processed before moving on
            sfile.write(date+','+str(idx)+','+tdn+'\n')
# EXTRACT DATA OF COMPANIES OF INTEREST AND SAVE EACH IN ITS OWN CSV FILE
#
# lower, upper and increment bounds for the moving-average and linear
# least-squares passes (renamed from 'range', which shadowed the builtin)
ma_range = {'lower':5,'upper':50,'skip':5} # lower, upper and inc
# gather a dict of companies we have data on file: code -> "name (extra)"
cinfo = dict()
with open('company.csv', 'r') as comfile:
    for line in comfile:
        if line.startswith('code'): continue
        c = line.split(',')
        cinfo[c[0]] = c[1]+' ('+c[2]+')'
# ask the user for the stock codes of all companies they are interested in
colist = []
code = '0'
while code != 'q':
    print('\nWhat is the stock code of the company you want me to extract? ')
    # prompt typo fixed: 'quite' -> 'quit'
    code = input('Enter code or q to quit: ')
    # exit if the answer is 'q'
    if code == 'q':
        continue
    # confirm with the user and queue the code if it is in the database
    elif code in cinfo:
        print('\nCompany with code: ', code, ' is ', cinfo[code])
        ans = input('Is it correct? (y/n): ')
        if ans == 'y':
            colist.append(code)
            t = input('\nDo you want me to prepare data for another company? (y/n): ')
            if t == 'n': code = 'q'
        else:
            print("\nLet's try again.\n")
    # ask for another code if it is not on record
    else:
        print('\nSorry, code is not in my record. Please try again.')
# SET UP the analysis directory and run the analyses for each chosen code
os.makedirs(adir, exist_ok=True)
for code in colist:
    mf.csv(code)
    ma.mvavg(code,ddir,adir,ma_range)
    lsq.llsq(code,ddir,adir,ma_range)
# inform the user where the result files are located
print('\nThe results are located in ', ddir, ' and ', adir)