# utils.py: Utility functions for the Canadian COVID-19 Data Archive #
# https://github.com/ccodwg/Covid19CanadaArchive #
# Maintainer: Jean-Paul R. Soucy #

# import modules from Python standard library
import sys
import json
import tempfile
import sqlite3

# import other modules (requires the requests package)
import requests
### list_inactive_datasets: List datasets that have not been updated for an unusually long time ###
def list_inactive_datasets():
    ## import modules
    import pandas as pd
    pd.options.mode.chained_assignment = None # disable chained assignment warning
    ## load active datasets and extract active UUIDs as list
    with open('datasets.json') as json_file:
        datasets = json.load(json_file)
    datasets = datasets['active'] # subset active datasets
    ds = {}
    for d in datasets:
        for i in range(len(datasets[d])):
            ds[datasets[d][i]["uuid"]] = datasets[d][i]
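    # note: datasets.json is assumed (from how it is used here and in
    # retire_datasets below) to contain 'active' and 'inactive' dictionaries,
    # each mapping a directory key to a list of dataset entries; active entries
    # carry at least 'uuid', 'dir_parent' and 'dir_file' fields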
    ## download file index and read into dataframe
    no_cache_headers = {"Cache-Control": "no-cache", "Pragma": "no-cache"}
    temp = tempfile.NamedTemporaryFile()
    with open(temp.name, "wb") as f:
        f.write(requests.get("https://data.opencovid.ca/archive/index.db", headers = no_cache_headers).content)
    ind = pd.read_sql("SELECT * FROM archive", sqlite3.connect(temp.name))
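    # the 'archive' table is assumed (from the columns referenced below) to
    # provide at least: uuid, file_date, file_timestamp and file_duplicate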
    ## filter out datasets already marked as inactive
    ind = ind[ind["uuid"].isin(ds.keys())]
    ## filter to final file from each date
    ind["file_final_for_date"] = ind.groupby(["uuid", "file_date"])["file_timestamp"].transform("max") == ind["file_timestamp"]
    ind = ind[ind["file_final_for_date"] == True]
    ## join datasets metadata
    datasets_meta = pd.DataFrame.from_dict(ds, orient = "index")[["uuid", "dir_parent", "dir_file"]]
    ind = pd.merge(ind, datasets_meta, on = "uuid", how = "left")
    ## order datasets by UUID and timestamp
    ind = ind.sort_values(by = ["uuid", "file_timestamp"]).reset_index(drop = True)
    ## add name column
    ind["name"] = ind["dir_parent"] + "/" + ind["dir_file"]
    ## list to hold results
    log = []
    ## for each UUID, calculate a cumulative sum for file_duplicate, resetting at each 0 (i.e., each new file)
    ## then, see if the current run of duplicates is longer than all previous runs of duplicates
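    ## illustration of the run calculation below (values are hypothetical):
    ##   file_duplicate = [0, 1, 1, 0, 1]  ->  runs = [0, 1, 2, 0, 1]
    ## i.e., the count of consecutive duplicates restarts after every new file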
    for uuid in ind["uuid"].unique():
        # filter data
        u = ind[ind["uuid"] == uuid]
        # calculate runs of 1s (i.e., consecutive duplicates)
        u["file_duplicate"] = u["file_duplicate"] != 0
        d = u["file_duplicate"]
        u["file_duplicate"] = d.cumsum()-d.cumsum().where(~d).ffill().fillna(0).astype(int)
        # get last value
        last = u["file_duplicate"].iloc[-1]
        # get max value excluding final run
        # find positions where the run resets to zero
        final_zero = u[u["file_duplicate"] == 0]
        if len(final_zero) == 0:
            # every entry is a duplicate (there is no earlier run to compare against)
            max = 0
        else:
            final_zero = final_zero.index[-1]
            ## filter to everything before final zero
            u = u[u.index.isin(range(final_zero))]
            max = u["file_duplicate"].max()
        # if last value is larger than max and at least 7, add to log
        if last > max and last >= 7:
            log.append([u["name"].iloc[-1], u["uuid"].iloc[-1], last, max])
    ## save result and sort
    log = pd.DataFrame(log, columns = ["name", "uuid", "current_dup_run", "previous_max_dup_run"]).sort_values(by = ["current_dup_run", "name"], ascending = [False, True])
    log = log.to_string(index=False)
    ## print result
    print(log)
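
# example (assumed usage via the command-line dispatcher at the bottom of this file):
#   python utils.py list_inactive_datasets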
### retire_datasets: Move dataset(s) from 'active' to 'inactive' ###
def retire_datasets(uuids):
    # load datasets.json
    with open('datasets.json') as json_file:
        datasets = json.load(json_file)
    # loop through UUIDs
    for uuid in uuids:
        # find location of dataset
        i_uuid = None
        for k in datasets['active'].keys():
            for i in range(len(datasets['active'][k])):
                u = datasets['active'][k][i]['uuid']
                if u == uuid:
                    i_uuid = i
                    k_uuid = k
                    break
            else:
                continue
            break
        # stop if no matching active dataset found
        if i_uuid is None:
            print('No matching active dataset found with UUID: ' + uuid)
        else:
            # copy dataset
            d = datasets['active'][k_uuid][i_uuid]
            # add inactive flag
            d['active'] = 'False'
            # remove dataset from active
            datasets['active'][k_uuid].pop(i_uuid)
            # append dataset to inactive
            datasets['inactive'][k_uuid].append(d)
    # write datasets.json
    with open('datasets.json', 'w') as json_file:
        json.dump(datasets, json_file, indent=2, ensure_ascii=False)
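
# example (assumed usage via the command-line dispatcher below; UUIDs are placeholders):
#   python utils.py retire_datasets <uuid-1> <uuid-2>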
# run utility functions from command line by calling them by name
if __name__ == '__main__':
    # no positional argument
    if len(sys.argv) == 2:
        globals()[sys.argv[1]]()
    # one or more positional arguments (passed to the function as a single list)
    elif len(sys.argv) > 2:
        globals()[sys.argv[1]](sys.argv[2:])