-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathgethistoric.py
158 lines (146 loc) · 5.65 KB
/
gethistoric.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""Load long range month matches from web site"""
import pandas
import re
import datetime
import sys
sys.path.append('..\util')
import common
from common import split_line
import pyodbc
import numpy
import os
import logging
## Folder to save to
DIR_DATA = r'data/longrange'
## Columns to use
COLUMNS = ['Name','Year','Month','Value']
## Index columns
INDEX = COLUMNS[:3]
# using product created by weather office now
## Root URL for downloading
site = common.CONFIG.get('FireGUARD', 'url_agency_wx')
## URL for downloading
url = site + common.CONFIG.get('FireGUARD', 'url_agency_wx_longrange')
def save_file(url):
"""!
Save the given url
@param url URL to save
@return Path that URL was saved to
"""
return common.save_http(DIR_DATA, url)
def get_file_name(for_run):
"""!
Generates name to use for file with given run data
@param for_run datetime for run
@return File name that run data should be saved to
"""
return os.path.join(DIR_DATA, 'longrange_{}.csv'.format(for_run.strftime('%Y%m%d%H%M')))
def get_standard(force=False):
"""!
Save long range match data
@param force Whether or not to save if file already exists
@return Timestamp for saved data
"""
# HACK: this is inside the intranet so make sure we're not using a proxy if we're in the intranet
if common.CURRENT_PROXY is not None and common.CURRENT_PROXY.endswith(common.MNR_PROXY):
common.set_proxy(None)
logging.debug('Saving long range forecast matches')
filename = save_file(url)
filetime = os.path.getmtime(filename)
for_run = datetime.datetime.fromtimestamp(filetime)
# HACK: get rid of microseconds because database conversion fails
for_run = for_run.replace(microsecond=0)
# copy file to dated file name so that we can load them later on
common.copy_file(filename, get_file_name(for_run))
return load_file(for_run, force)
def load_file(for_run, force=False):
"""!
Load data for a given run into the database
@param for_run datetime to load data for
@param force Whether or not to save if file already exists
@return Timestamp for run data that was loaded
"""
if not force:
logging.debug('Checking if data is already present for long range matches generated at {}'.format(for_run))
exists = check_exists(for_run)
if exists:
logging.debug('Data already loaded - aborting')
return pandas.Timestamp(for_run)
filename = get_file_name(for_run)
df = pandas.read_csv(filename)
# throw out everything except years and months
df = df[df.columns[:13]]
# change months into numbers
df.columns = ['Year'] + range(1, 13)
df = pandas.melt(df,
id_vars=['Year'],
var_name='Month',
value_name='Value')
# remove NaN rows
df = df.query('Value == Value')
# get warnings later if we don't set this
df.is_copy = False
# convert to ratio while checking for % and other possible inputs
def fix_ratio(x):
if isinstance(x, str):
if '%' == x[-1]:
x = int(x[:-1])
x = int(x)
return x if 1 >= x else x / 100.0
#~ fix_ratio(df.values[0][2])
df['Value'] = df['Value'].apply(fix_ratio)
df['Generated'] = pandas.to_datetime(for_run)
#~ df['Generated'] = df['Generated'].astype('datetime64[s]')
df = df.set_index(['Generated', 'Year', 'Month'])
database = 'HINDCAST'
schema = 'HINDCAST'
final_table = 'DAT_HistoricMatch'
def do_insert_only(cnxn, table, data):
"""Insert and assume success because no duplicate keys should exist"""
# rely on deleting from FK table to remove everything from this table, so just insert
stmt_insert = common.make_insert_statement(table, data.reset_index().columns)
common.trans_insert_data(cnxn, data, stmt_insert)
try:
cnxn = common.open_local_db(database)
cur_df = df
cur_df = common.write_foreign(cnxn, schema, 'DAT_Historic', ['Generated'], common.trans_save_data, cur_df)
logging.debug('Writing data to {}'.format(final_table))
do_insert_only(cnxn, '[{}].[{}]'.format(schema, final_table), cur_df)
cnxn.commit()
finally:
cnxn.close()
return pandas.Timestamp(for_run)
def check_exists(for_run):
"""!
Check if data for given Generated time already exists
@param for_run datatime to check database for run data from
@return Whether or not data exists in database for given run
"""
try:
cnxn = common.open_local_db('HINDCAST')
df = pandas.read_sql('SELECT * FROM [HINDCAST].[DAT_Historic]', cnxn)
finally:
cnxn.close()
return 1 == len(df.query('Generated == \'{}\''.format(for_run)))
def load_past_records(year=None, force=False):
"""!
Determine which past run files we have and load them
@param year Year to load, or None to load all past years
@param force Whether or not to load if data already exists
@return None
"""
import glob
files = glob.glob(os.path.join(DIR_DATA, 'longrange_*.csv'))
def file_run(file):
start = file.index('_') + 1
end = file.index('.', start)
return file[start:end]
runs = list(set([file_run(d) for d in files]))
# sort so we do the dates chronologically
runs.sort()
for run in runs:
for_run = datetime.datetime.strptime(run, '%Y%m%d%H%M')
if year is None or for_run.year == year:
load_file(for_run, force)
if __name__ == "__main__":
get_standard()