# -*- coding: utf-8 -*-
"""
Created on Tue Jan 16 21:32:54 2018
@author: Dean
"""
#%%
import numpy as np
import pandas as pd
import time
import pickle
from private_keys import cryptowatch_public
from DataTypes import printmd
#*******************************************************************************
class DataLoader:
def __init__(self, filename):
self.filename = filename
filehandler = open(filename, 'rb')
self.package = pickle.load(filehandler)
filehandler.close()
#*******************************************************************************
def GetDurationAvailable(self, pair):
data = self.package['data']
dur_avail = data[pair]['time'].iloc[-1] - data[pair]['time'].iloc[0]
return dur_avail
#*******************************************************************************
def GetPairsSummary(self):
data = self.package['data']
pairs = []
dur_avail = []
last_time = []
for pair in data:
pairs.append(pair)
dur_avail.append(data[pair]['time'].iloc[-1] - data[pair]['time'].iloc[0])
last_time.append(data[pair]['time'].iloc[-1])
df = pd.DataFrame({'pair':pairs, 'dur_avail':dur_avail, 'last_time':last_time})
return df
#*******************************************************************************
def GetHourlyDf(self, coins, num_hours, verbose=1):
"""Grab the hourly data from the loaded file, as a list of DataFrames
Args:
coins (list): a list of coins of interest. The reference currency is chosen automatically (usd, usdt, or usdc, in order of preference)
num_hours (int): the number of hours (data points) for each coin
verbose (int, optional): print the chosen pair per coin when > 0. Defaults to 1.
Returns:
list: A list of DataFrames, 1 per coin
"""
data = self.package['data']
# 'data' is a dictionary where the key indicates the trading pair
dfs = [] # Output is a list of dataframes
# For each coin, pick a pair and extract the desired time
ref_options = ['usd', 'usdt', 'usdc'] # Order of preference
for coin in coins:
coin_found = False
max_rows_avail = 0
for base in ref_options:
pair_check = coin.lower() + base
if pair_check in data:
# This trading pair exists. Check the duration
dur_avail = data[pair_check]['time'].iloc[-1] - data[pair_check]['time'].iloc[0]
rows_avail = len(data[pair_check])
max_rows_avail = max(max_rows_avail, rows_avail)
if rows_avail > num_hours:
# Sufficient duration. Go with it!
if verbose > 0:
print(f'[{coin}] Using trading pair {pair_check}')
# Extract the relevant rows and save
# copy() to avoid SettingWithCopyWarning error
this_df = data[pair_check].iloc[-num_hours:].copy()
this_df.name = coin
dfs.append(this_df)
coin_found = True
break
if not coin_found:
printmd('\n **ERROR!**', color="0xFF8888")
print(f'GetHourlyDf: No valid pair found for {coin} in file {self.filename}.')
print(f"max_rows_avail={max_rows_avail}. Wanted {num_hours} rows.")
raise ValueError(f'GetHourlyDf: no pair with at least {num_hours} rows found for {coin}')
return dfs
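# A minimal usage sketch for DataLoader (the pickle path matches the naming
# convention produced by ReadKrakenCsv further down; adjust to taste):
# loader = DataLoader('./indata/2021-09-30_price_data_60m.pickle')
# dfs = loader.GetHourlyDf(['ETH', 'BTC'], num_hours=24 * 30)
# print(dfs[0].name, dfs[0][['close', 'volume_nom']].tail())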
#*******************************************************************************
def GetHourlyDfPairs(self, pairs, num_hours):
"""Grab the hourly data from the loaded file, as a list of DataFrames
Args:
pairs (list): a list of pairs to load
num_hours (int): the number of hours (data points) for each pair
Returns:
list: A list of DataFrames, 1 per pair
"""
data = self.package['data']
# 'data' is a dictionary where the key indicates the trading pair
dfs = [] # Output is a list of dataframes
# For each coin, pick a pair and extract the desired time
for pair in pairs:
if pair not in data:
printmd('\n **ERROR!**', color="0xFF8888")
print(f'GetHourlyDfPairs: Pair {pair} not found in file {self.filename}.')
continue
# Check the duration
dur_avail = data[pair]['time'].iloc[-1] - data[pair]['time'].iloc[0]
rows_avail = len(data[pair])
if rows_avail > num_hours:
# Sufficient duration. Go with it!
# Extract the relevant rows and save
# copy() to avoid SettingWithCopyWarning error
this_df = data[pair].iloc[-num_hours:].copy()
this_df.name = pair
dfs.append(this_df)
else:
printmd('\n **ERROR!**', color="0xFF8888")
print(f'GetHourlyDfPairs: Insufficient time available for pair {pair} in file {self.filename}.')
print(f"rows_avail={rows_avail}. Wanted {num_hours} rows.")
return dfs
# %%
#*******************************************************************************
#*******************************************************************************
#*******************************************************************************
# pycwatch
# For incremental data. Not used as of 2022-08-22
import pycwatch # https://github.com/iuvbio/pycwatch
# API docs are here: https://docs.cryptowat.ch/rest-api/markets/list
def GetHourlyDfCryptowatch(coins, numHours):
"""
Get hourly dataframes from the Cryptowatch API (using the pycwatch lib)
Returns a dataframe for each coin. Attempts to download numHours of data
per coin; if that much data isn't available, it returns whatever it finds.
Columns in the output are time, high, low, close, volume_nom
('open' and 'quote_volume' are dropped)
"""
t_diff_max = 40*24*60*60 # Seconds. Max of 1000 points, so I'll do only 40 days per call (960 hours)
dfs = []
t_now = int(time.time())
api = pycwatch.rest # create api client
api.api_key = cryptowatch_public
# list of available assets
#assets = api.list_assets()
#price = api.get_market_price(exchange, pair)
# Get all market info:
#all_markets = api.get_all_market_summaries()
# all_markets is a dictionary with ~7828 entries
# all_markets['binance:cakebtc'] =
# {'price': {'last': 0.0006343,
# 'high': 0.0006895,
# 'low': 0.0006132,
# 'change': {'percentage': -0.0031431714600031, 'absolute': -2e-06}},
# 'volume': 916706.98,
# 'volumeQuote': 590.520499377}
# Generate a list of every exchange, and the number of pairs on each
# This will be used to determine precedence
all_pairs = api.list_markets()
all_pairs_df = pd.DataFrame(all_pairs)
exch = all_pairs_df['exchange'].drop_duplicates()
pair_count = [sum(all_pairs_df['exchange']==e) for e in exch]
exch_precedence = dict(zip(list(exch), pair_count)) # exchanges with more pairs will be preferred
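# Illustrative shape of exch_precedence (counts are made up): {'binance': 700, 'kraken': 350, ...}
# A larger pair count is used below as a rough proxy for exchange quality/liquidity.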
period_options = {
'1m':60,
'3m':180,
'15m':900,
'30m':1800,
'1h':3600,
'2h':7200,
'4h':14400,
'6h':21600,
'12h':43200,
'1d':86400
# There are further options, but I won't use them
}
ohlc_col_headers = [
'time', # Unix time (secs since 1970)
'open', # price
'high', # price
'low', # price
'close', # price
'volume', # volume, in units of the base coin. Base = nominated coin. E.g. NANO in NANO-USD
'quote_volume' # volume, in units of the quoted coin. Quoted = reference coin. E.g. USD in NANO-USD
]
period_str = '1h'
period_int = period_options[period_str]
def FindDataFirstFinalTimes(exchange, pair_str, period_int):
# Find the first and final times available for this data pair
t_2010 = 1262332800 # Unix time for 2010
t_now = int(time.time())
# Get the first data point
try:
ohlc_resp = api.get_market_ohlc(exchange, pair_str, periods=period_int, after=t_2010)
except pycwatch.errors.APIError:
print(f'[{coin}] Exchange {exchange} with pair {pair_str} not found! Something went wrong')
raise
if len(ohlc_resp[period_str]) == 0:
return (t_now, t_now)
this_df = pd.DataFrame(ohlc_resp[period_str], columns = ohlc_col_headers)
first_t = this_df['time'].iloc[0]
# Get the final data point
ohlc_resp = api.get_market_ohlc(exchange, pair_str, periods=period_int, before=t_now)
this_df = pd.DataFrame(ohlc_resp[period_str], columns = ohlc_col_headers)
final_t = this_df['time'].iloc[-1]
return (first_t, final_t)
print('\n********************************************************')
printmd(f"## Get Crypto Data")
print(f"Getting {numHours} hours ({numHours/24:.2f} days)")
print(f"Starting from {time.ctime(t_now - numHours * 60 * 60)} until now")
for coin in coins:
printmd(f'### [{coin}] Getting data for {coin}')
# STEP 1: DECIDE ON THE PAIR & EXCHANGE
# The outputs of Step 1 are exchange, pair_str, first_t, & final_t
asset_details = api.get_asset_details(coin) # Lists the details of every trading pair that the coin is involved with
pairs_df = pd.DataFrame(asset_details['markets']['base']) # All trading pairs where this coin is "base" currency
pairs_df['exch_score'] = pairs_df['exchange'].map(lambda e : exch_precedence[e] if e in exch_precedence else 0)
pairs_df.sort_values('exch_score', ascending=False, inplace=True)
# Decide on the exchange and coin pair
base_options = ['usdt', 'usd'] # Could expand on this to support BTC and ETH pairs
option_count = 0
# Check the length of data available for every possible exchange & pair
pairs_df['first_t'] = int(0)
pairs_df['final_t'] = int(0)
pairs_df = pairs_df.astype({'exchange':str, 'exch_score':int, 'pair':str,'first_t':'int64', 'final_t':'int64'})
for base in base_options:
pair_str = coin.lower() + base
for pair_idx in range(len(pairs_df)):
if pairs_df['pair'][pair_idx] == pair_str:
# Found the pair. How long has it existed for?
(first_t, final_t) = FindDataFirstFinalTimes(pairs_df['exchange'][pair_idx], pair_str, period_int)
pairs_df.at[pair_idx, 'first_t'] = first_t
pairs_df.at[pair_idx, 'final_t'] = final_t
option_count += 1
#print(f"{pair_str:8} from {exchange:12} avail for {(final_t - first_t) / (3600*24):6.0f} days. Since {time.ctime(first_t)}")
if option_count == 0:
print(f'ERROR! No suitable pair found for coin {coin}')
continue
# Choose from the options
pairs_df['days'] = (pairs_df['final_t'] - pairs_df['first_t']) / (60*60*24)
t_range = [int(t_now - numHours * 3600), t_now] # The requested time range
pairs_df['time_coverage'] = (pairs_df['final_t'].map(lambda t : max(t_range[0], min(t_range[1], t))) - \
pairs_df['first_t'].map(lambda t : min(t_range[1], max(t_range[0], t)))\
+ pairs_df['exch_score']) / (t_range[1] - t_range[0])
# I've added 'exch score' to the time coverage such that if 2 exchanges have the same
# time coverage, the one with the higher score will be chosen
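# Worked example (illustrative numbers): if t_range spans 100 days and a pair's data
# covers only the most recent 60 of those days, its time_coverage is ~0.6; the
# exch_score term (a pair count, vs a range in seconds) only ever breaks ties.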
# Print a table of all of the candidate pairs & exchanges
printmd(f'[{coin}] **Exchange & pair comparison:**')
print(pairs_df[pairs_df['first_t'] != 0][['exchange','pair','days','time_coverage']].to_string(
formatters={'time_coverage':'{:6.3f}'.format, 'days':'{:7.2f}'.format}))
# Choose the best option
pair_idx = pairs_df['time_coverage'].idxmax()
exchange = pairs_df['exchange'][pair_idx]
pair_str = pairs_df['pair'][pair_idx]
first_t = pairs_df['first_t'][pair_idx]
final_t = pairs_df['final_t'][pair_idx]
# Print chosen pair & exchange
hours_avail = int((final_t - first_t) / 3600) + 1
printmd(f'[{coin}] Using pair **{pair_str}** from exchange **{exchange}**. {hours_avail/24:.0f} days of data available')
if hours_avail < numHours:
printmd(f'[{coin}] **{numHours/24:.0f} days requested, but only {hours_avail/24:.0f} days available!**', color="0xFF8888")
# STEP 2: DOWNLOAD THE DATA
# Collect the actual data
# Start time at the back and work forward
# unix time (second since 1970)
t = max(first_t, int(t_now - numHours * 3600))
printmd(f'\n[{coin}] **Downloading the data**')
print(f"[{coin}] Starting from {time.ctime(t)}")
df = pd.DataFrame()
conn_err_count = 0 # count of consecutive connection errors
success = False # remains False if no data chunk is ever downloaded
while t < final_t:
t_start = t
time_span = min(t_diff_max, final_t - t)
point_count = round(time_span/3600)
t_end = t + time_span
print(f'[{coin}] Getting {point_count:4.0f} points, up to time {time.ctime(t_end)}')
#print(f'[{coin}] Getting {point_count:4.2f} points, {t_start} to {t_end}. Diff= {t_end - t_start}')
try:
ohlc_resp = api.get_market_ohlc(exchange, pair_str, periods=period_int, after=int(t_start), before=int(t_end))
except pycwatch.errors.APIError:
print(f'[{coin}] Exchange {exchange} with pair {pair_str} not found! Something went wrong')
success = False
raise
except Exception: # e.g. ConnectionError, ConnectionAbortedError
print(f'[{coin}] Request data error! Trying again')
conn_err_count += 1
if conn_err_count > 3:
print(f'[{coin}] Giving up')
success = False
break
continue
conn_err_count = 0
points_recv = len(ohlc_resp[period_str])
if points_recv == 0:
print(f'[{coin}] RECEIVED NO DATA POINTS!')
if points_recv > 0:
this_df = pd.DataFrame(ohlc_resp[period_str], columns = ohlc_col_headers)
this_df.set_index(inplace=True, keys=pd.DatetimeIndex(pd.to_datetime(this_df['time'],unit='s')))
this_df.index.name='datetime'
this_df.pop('quote_volume')
this_df.pop('open')
this_df.rename(columns={'volume':'volume_nom'}, inplace=True) # Volume in nominated currency
if points_recv != point_count:
print(f'[{coin}] Actually received {points_recv} / {point_count} points.')
#print(f"[{coin}] Actually received {points_recv} / {point_count} points. Up to {time.ctime(this_df['time'].iloc[-1])}")
if df.empty:
df = this_df
else:
df = pd.concat([df, this_df])
success = True
t = t_end
if success:
dfs.append(df)
print(f'[{coin}] download finished. {len(df)} rows (hours) total')
# # To save a data set:
# dateStr = datetime.now().strftime('%Y-%m-%d')
# filehandler = open(f'./indata/dfs_{len(dfs)}coins_{numHours}hours_{dateStr}.pickle', 'wb')
# package = {'dfs':dfs, 'coinList':r.coinList, 'numHours':numHours, 'dateStr':dateStr}
# pickle.dump(package, filehandler)
# filehandler.close()
return dfs
#%%
#*******************************************************************************
#*******************************************************************************
#*******************************************************************************
# READ & PICKLE KRAKEN CSV
# Downloaded from https://support.kraken.com/hc/en-us/articles/360047543791-Downloadable-historical-market-data-time-and-sales-
# Read in all hourly data
import pandas as pd
import numpy as np
import os
import time
from DataTypes import printmd
import pickle
def ReadKrakenCsv(csv_dir):
"""
READ & PICKLE KRAKEN CSV
First, download the full data dump of kraken data from:
https://support.kraken.com/hc/en-us/articles/360047124832-Downloadable-historical-OHLCVT-Open-High-Low-Close-Volume-Trades-data
Then, call this function to read the hourly CSVs into a dataframe, & pickle it.
All non-hourly files are currently ignored.
The intention is that the data file could then be updated as needed from cryptowatch.
This function also scrubs the data
"""
printmd('## Read Kraken CSV')
print(f'Loading CSVs from {csv_dir}')
#csv_dir = 'C:/Users/deanr/Desktop/temp/kraken_data/Kraken_OHLCVT'
data_hist = {}
timestep = int(3600)
time_modified = None
time_last_data = None
for file in os.listdir(os.fsencode(csv_dir)):
filename = os.fsdecode(file)
if filename.endswith("_60.csv"):
pair_str = filename.split('_')[0].lower()
# Kraken API uses 'XBT' instead of 'BTC'. Apply this change
pair_str = pair_str.replace('xbt','btc')
pair_df = pd.read_csv(os.path.join(csv_dir, filename), header=None, \
names=['time','open','high','low','close','volume_nom','trades'], \
dtype={'time':np.int64, 'trades':np.int64})
# volume_nom = volume in nominated currency. E.g. 'NANO' in 'NANO-USD' pair
pair_df.set_index(inplace=True, keys=pd.DatetimeIndex(pd.to_datetime(pair_df['time'], unit='s')))
pair_df.index.name = 'datetime'
pair_df.pop('open')
pair_df.pop('trades')
pair_df['filler'] = False # Indicates whether each row was generated to fill a gap
data_hist[pair_str] = pair_df
# print(os.path.join(directory, filename))
if time_modified is None:
time_modified = os.stat(os.path.join(csv_dir, filename)).st_mtime
if pair_str == 'ethusd':
time_last_data = pair_df['time'].iloc[-1]
# Remove spotty starts and fill gaps in the data
data_hist, _, _ = DataScrubbing(data_hist, timestep)
# Add change_vs_market
total_market_change = CalcChangeVsMarket(data_hist)
# Save the data
date_str = pd.to_datetime(time_last_data, unit='s').strftime('%Y-%m-%d')
save_filename = f'./indata/{date_str}_price_data_60m.pickle'
filehandler = open(save_filename, 'wb')
package = {'data':data_hist, 'total_market_change':total_market_change, 'date_str':date_str, \
'time_saved':time.time(), 'time_last_data':time_last_data, 'time_kraken_modified':time_modified, \
'gaps_filled_until':time_last_data, 'timestep':timestep}
pickle.dump(package, filehandler)
filehandler.close()
print(f'Saved to {save_filename}')
print(f'DONE! Loaded & pickled CSV data for {len(data_hist)} pairs.')
#*******************************************************************************
# DEALING WITH GAPS IN DATA
from datetime import datetime
def RemoveSpottyStart(df, timestep, check_len = 24):
"""Removes initial data from a series, until there is a gapless block of data
of length [check_len].
Args:
df ([DataFrame]): The dataframe to modify
timestep ([int]): Expected timestep in seconds, e.g. 3600 for hour
check_len (int, optional): The number of required contiguous, gapless rows required to be considered 'valid'. Defaults to 24.
Returns:
[tuple]: (df, rows_removed). Returns an empty dataframe if all data is bad
"""
rows_initial = len(df.index)
t_ser = df[df['filler'] == False]['time']
delta_t = np.diff(t_ser)
# TODO check if I actually want convolution here, or autocorrelation
gap_sum = np.convolve(delta_t / timestep, np.ones([check_len,]), mode='valid')
first_good_idx = np.argmin(gap_sum)
if gap_sum[first_good_idx] < check_len*0.99:
printmd('# SOMETHING UNEXPECTED HAPPENED!')
print('Data gaps are smaller than expected. Are there time duplicates?')
elif gap_sum[first_good_idx] > check_len*1.01:
# There are no sections of data that pass the criteria
df = df[0:0]
return df, rows_initial
first_good_t = t_ser.iloc[first_good_idx]
df = df[df['time'] >= first_good_t]
rows_final = len(df.index)
rows_removed = rows_initial - rows_final
return df, rows_removed
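# Worked example of the sliding-window check above (assuming timestep=3600, check_len=3):
# times in hours [0, 1, 5, 6, 7, 8] -> delta_t/timestep = [1, 4, 1, 1, 1]
# np.convolve(..., np.ones([3,]), mode='valid') -> [6, 6, 3]
# argmin = 2 and gap_sum[2] == 3 == check_len, so the first gapless block
# starts at t_ser.iloc[2], i.e. hour 5; everything before it is dropped.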
#*******************************************************************************
def FillDataGaps(df, timestep):
"""Price data often has gaps in time. I want data to exist at all time points.
This function finds where gaps exist, then inserts new rows.
Price data filled in via interpolation.
Args:
df ([DataFrame]): DataFrame of stock data to process
timestep ([int]): Expected seconds between data points. e.g. 3600 for 1hr
Returns:
[tuple]: (df, total_gaps_filled)
"""
t_ser = df['time']
t = int(t_ser.iloc[0])
t -= t%timestep
t_end = t_ser.iloc[-1]
# Step through every expected time & record those that are missing
new_times = []
while t < t_end:
data_found = t in t_ser.values
if not data_found:
new_times.append(t)
t += timestep
if len(new_times) > 0:
newdf = pd.DataFrame(new_times, columns=['time'])
newdf.set_index(inplace=True, keys=pd.DatetimeIndex(pd.to_datetime(newdf['time'], unit='s')))
newdf.index.name='datetime'
# Assume volume is 0 during these gaps.
# All prices will be interpolated
newdf['volume_nom'] = 0
newdf['filler'] = True # All of these rows are filler rows
df = pd.concat([df, newdf])
df.sort_index(inplace=True)
df.interpolate(inplace=True) # Fill in price data with interpolations
return df, len(new_times)
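# A minimal self-test sketch for FillDataGaps on synthetic data (column names match
# the ones used above; the numbers are made up):
# _df = pd.DataFrame({'time': [0, 3600, 10800], 'close': [1.0, 2.0, 4.0],
#                     'volume_nom': [5.0, 5.0, 5.0], 'filler': False}) # hour 7200 missing
# _df.set_index(pd.DatetimeIndex(pd.to_datetime(_df['time'], unit='s')), inplace=True)
# _df, _n = FillDataGaps(_df, 3600)
# Expect _n == 1; the inserted 7200 row gets close interpolated to 3.0 and volume_nom == 0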
#*******************************************************************************
def DataScrubbing(data, timestep):
"""Calls RemoveSpottyStart and FillDataGaps on each DataFrame
Args:
data ([list]): List of dataframes of stock data
timestep ([int]): Expected seconds between data points. e.g. 3600 for 1hr
Returns:
[tuple]: (data, total_gaps_filled)
"""
printmd("## Data scrubbing")
timestep = int(timestep)
printmd(f'timestep={timestep}. {len(data)} pairs')
total_gaps_filled = 0
total_rows_removed = 0
pairs_to_delete = []
for pair, df in data.items():
rows_initial = len(df.index)
if rows_initial < 100:
print(f"[{pair:9s}] Had {rows_initial:6} rows. Not enough! REMOVED PAIR")
pairs_to_delete.append(pair)
continue
df, rows_removed = RemoveSpottyStart(df, timestep, check_len=24)
total_rows_removed += rows_removed
if len(df.index) == 0:
# All data is invalid
print(f'[{pair:9s}] Had {rows_initial:6} rows. Too spotty! REMOVED PAIR')
pairs_to_delete.append(pair)
continue
df, rows_added = FillDataGaps(df, timestep)
if rows_added == 0 and rows_removed == 0:
print(f'[{pair:9s}] No gaps in data. Leaving as-is.')
else:
rows_final = len(df.index)
data[pair] = df
total_gaps_filled += rows_added
print(f'[{pair:9s}] Had {rows_initial:6} rows. Removed initial {rows_removed:6} spotty rows. Added {rows_added:6} rows to fill in gaps. Now {rows_final:6} rows.')
for pair in pairs_to_delete:
del data[pair]
return data, total_rows_removed, total_gaps_filled
#*******************************************************************************
def FileDataScrubbing(filename):
"""Loads data file, calls FillDataGaps, saves file
Args:
filename ([str]): The file to load & save
"""
filehandler = open(filename, 'rb')
package = pickle.load(filehandler)
data = package['data']
filehandler.close()
data, total_rows_removed, total_gaps_filled = DataScrubbing(data, int(package['timestep']))
if total_rows_removed == 0 and total_gaps_filled == 0:
print('No rows removed or gaps filled')
else:
package['gaps_filled_until'] = package['time_last_data']
package['data'] = data
filehandler = open(filename, 'wb')
pickle.dump(package, filehandler)
filehandler.close()
print(f'Saved to {filename}')
#*******************************************************************************
def PlotGaps(data, pair='ethusd'):
"""Plots gap frequency for information & exploration purposes
to assess data quality
Args:
data ([list]): list of dataframes
pair (str, optional): Defaults to 'ethusd'.
"""
import matplotlib.pyplot as plt
delta_t = np.diff(data[pair]['time'])
from collections import Counter
cntr = Counter(delta_t)
total_cnt = len(delta_t)
del cntr[3600] # Remove the nominal 1-hour step (Counter's del is a no-op if the key is absent)
if len(cntr) == 0:
print(f'[{pair}] has no gaps')
return
#ax = plt.plot([int(i)/3600 for i in cntr.keys()], [int(i) for i in cntr.values()], 'x')
fig, ax = plt.subplots()
ax.plot([int(i)/3600 for i in cntr.keys()], [int(i)/total_cnt * 100 for i in cntr.values()], 'x')
ax.set(xlabel='Gap [hours]', ylabel='Occurrence [%]', title=f'Time gaps in {pair}')
#ax.set_yscale('log')
ax.grid()
# ******************************************************************************
# Comparing performance to market
# Adds the fields 'volume_usd', 'market_volume_fraction', and 'change_vs_market'
# volume_usd is the hourly volume, in USD
# market_volume_fraction is market cap of each token relative to the
# overall crypto market. Also called 'dominance'. Smoothed.
# change_vs_market is how the token performed compared to the overall crypto
# market, on an hourly basis. Each value is a ratio, so call .product() over
# a range to find the total performance difference over that range.
#
# Also returns market_ratio, which indicates how the overall market performed
# on an hourly basis.
#
# I can't easily get the market cap of each coin. Can't even easily get the
# overall crypto market cap. So, instead of weighting coins by their market cap,
# I weight coins by their volume in USD
def CalcChangeVsMarket(data):
fiat = ['usd', 'eur', 'cad', 'aud', 'gbp', 'chf', 'jpy'] # All fiat currencies
avg_len = 24 * 30 # hours. Used for smoothing for calculating the 'dominance' of each token
paired_to_usd = set()
usd_pairs = set()
for pair in data.keys():
nom, ref = GetPairNomRef(pair)
if ref == 'usd':
usd_pairs.add(pair)
paired_to_usd.add(nom)
# Calculate volume_usd (price * vol) for all USD pairs
# volume_usd is always in USD
for pair in usd_pairs:
df = data[pair]
df['volume_usd'] = df['close'] * df['volume_nom']
# print(f"added {pair:8s} len={len(df['volume_usd'])}")
# Find pairs that don't include USD, but can be converted to USD
# This is to make sure that I account for all (most) of the volume
# This will add pairs using EUR, BTC, USDT, etc.
for pair, df in data.items():
nom, ref = GetPairNomRef(pair)
if ref in paired_to_usd:
# print(f"Extra pair {pair:8s}")
df_refusd = data[ref + 'usd'] # Should always succeed, because paired_to_usd is known
idx_common = df.index.intersection(df_refusd.index)
df['volume_usd'] = (df['close'] * df['volume_nom']).multiply(df_refusd['close'][idx_common])
# Add volume_usd to reference currency USD pair
df_refusd['volume_usd'] = df_refusd['volume_usd'].add(df['volume_usd'], fill_value=0)
# Add volume_usd to nominated currency USD pair (if it exists)
if nom + 'usd' in data.keys():
df_nomusd = data[nom + 'usd']
df_nomusd['volume_usd'] = df_nomusd['volume_usd'].add(df['volume_usd'], fill_value=0)
# Smooth and sum to get total volume in USD
ser_volume_usd = pd.Series(dtype='float64')
for pair, df in data.items():
if 'volume_usd' in df.columns:
# I'm doing ".sum() / avg_len" instead of ".mean()", because the behaviour differs
# when fewer than avg_len data points are available: ".sum() / avg_len" ramps up from 0.
df['volume_usd'] = df['volume_usd'].rolling(avg_len, min_periods=1).sum() * 1/avg_len
nom, ref = GetPairNomRef(pair)
if (pair in usd_pairs) and (nom not in fiat): # Include only tokens (exclude fiat)
ser_volume_usd = ser_volume_usd.add(df['volume_usd'], fill_value=0)
# Scale each volume_usd by the total
# This will be used as a proxy for % market cap
for pair in usd_pairs:
df = data[pair]
df['market_volume_fraction'] = df['volume_usd'] / ser_volume_usd[df.index]
# Calculate change_vs_market
# Calculate a weighted average for the entire market: average the % change in 'close', weighted by the market_volume_fraction.
# Note that it's not smoothed.
# The 'market' is the crypto market, denominated in USD.
# So comparing pairs that aren't referenced in fiat (e.g. 'nanobtc') vs the 'market' likely isn't insightful.
# 'change_vs_market' is a series of ratios. To compare 2 dates, perform a product() of all values between them.
# First, calculate the market performance
ser_sum = pd.Series(dtype='float64')
ser_count = pd.Series(dtype='float64')
for pair in usd_pairs:
nom, ref = GetPairNomRef(pair)
if nom not in fiat: # Include only tokens (exclude fiat)
df = data[pair]
diff = df['close'].pct_change()
diff.iloc[0] = 0.
ser_sum = ser_sum.add(diff * df['market_volume_fraction'], fill_value=0)
ser_count = ser_count.add(df['market_volume_fraction'], fill_value=0)
ser_market_diff = ser_sum / ser_count
ser_market_ratio = ser_market_diff + 1. # Ratios of changes
# For each coin, for each time step: calculate the performance vs the market
for pair, df in data.items():
idx_common = df.index.intersection(ser_market_ratio.index)
ratio = df['close'].pct_change() + 1
ratio.iloc[0] = 1.
df['change_vs_market'] = ratio.divide(ser_market_ratio[idx_common])
print("Calculated market_volume_fraction and change_vs_market")
# I COULD use unit-less absolute values instead of ratios
# df['change_vs_market'].cumprod() gives the unit-less absolute values
# Ratios vs absolute values
# df['change_vs_market'].cumprod()
# is the same as
# df['close'] / ser_market_ratio.cumprod()
# df['change_vs_market'][20:25].prod()
# is the same as
# a[24] / a[20], where: a = df['close'] / ser_market_ratio.cumprod();
return ser_market_ratio
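# Interpretation sketch (made-up numbers): if a token's close rises 2% in an hour while
# the volume-weighted market rises 1%, that hour's change_vs_market is 1.02/1.01 ~ 1.0099.
# Compounding over a window, e.g. df['change_vs_market'][20:25].prod(), gives the total
# out/under-performance vs the market across those hours.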
# ******************************************************************************
# GetPairNomRef
# Returns the pair string split into (nominated, reference)
# Usage: nom, ref = GetPairNomRef(pair)
# Other terminology: base=nominated. Quoted=reference.
# Pair order is nominated-reference. E.g. in NANO-USD, NANO=nominated & USD=reference.
def GetPairNomRef(pair):
# Logic: all 'reference' currencies are 3 characters long, except for usdt and usdc
if pair[-4:] in ['usdc', 'usdt']:
return (pair[:-4], pair[-4:])
else:
return (pair[:-3], pair[-3:])
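# Examples:
# GetPairNomRef('nanousd') -> ('nano', 'usd')
# GetPairNomRef('btcusdt') -> ('btc', 'usdt')
# GetPairNomRef('ethbtc') -> ('eth', 'btc')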
#%%
#*******************************************************************************
# Testing
if __name__ == '__main__':
os.chdir(os.path.dirname(os.path.dirname(__file__)))
print(f'Working directory is "{os.getcwd()}"')
#filename = './indata/2021-09-30_price_data_60m.pickle'
#dfs = GetHourlyDf(filename, ['ETH'], 100)
ReadKrakenCsv('C:/Users/deanr/Desktop/temp/kraken_data/Kraken_OHLCVT 2022 Q2/')
#%%
if 0:
# Example code to run PlotGaps, or DataScrubbing
# 2021-12-26
os.chdir(os.path.dirname(os.path.dirname(__file__)))
filename = './indata/2021-09-30_price_data_60m.pickle'
filehandler = open(filename, 'rb')
package = pickle.load(filehandler)
data = package['data']
filehandler.close()
#PlotGaps(data, 'ethusd')
#data, _, _ = DataScrubbing(data, int(package['timestep']))
# %%