-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathtest-nursery.py
309 lines (237 loc) · 10.3 KB
/
test-nursery.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
"""
ACRO Tests.
Copyright : Maha Albashir, Richard Preen, Jim Smith 2023.
"""
# import libraries
import os
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff
from acro import ACRO, add_constant
# Create the ACRO session object that every later example in this script uses.
# The banner is printed first so it appears before anything ACRO() itself emits.
print(
    "\n".join(
        (
            "\n Creating an acro object().",
            "The TRE's risk appetite is read from default.yml",
            "and shown to the researcher and output checker",
        )
    )
)
acro = ACRO()
# Load test data
# The examples use the nursery dataset from OpenML.
# - It can be fetched straight from OpenML with the commented-out snippet below.
# - This version reads a local copy that has already been downloaded.
# - The file is expected in a folder called "data" that sits at the same level
#   as the folder you are working in.
# - Adjust the path if the file lives elsewhere, for example
#       path = os.path.join("data", "nursery.arff")
#   when the data is in a sub-folder of your working folder.
#
# Alternative: load from the web
# from sklearn.datasets import fetch_openml
# data = fetch_openml(data_id=26, as_frame=True)
# df = data.data
# df["recommend"] = data.target

# Load from the local directory
path = os.path.join("../data", "nursery.arff")
data = loadarff(path)
# Keep the object-typed columns, decode their byte strings as UTF-8 and
# rename the "class" column to "recommend".
df = (
    pd.DataFrame(data[0])
    .select_dtypes([object])
    .stack()
    .str.decode("utf-8")
    .unstack()
    .rename(columns={"class": "recommend"})
)
print("\n Data loaded, these are the first five rows")
print(df.head())
# Change the children column from categorical to numeric so that the ACRO
# functions which need a numeric feature can be exercised.
# Rows recorded as "more" (than 3 children) are given a random integer drawn
# with np.random.randint(4, 10), i.e. from 4 to 9 inclusive (the upper bound
# is exclusive).
print("\nChanging number of children to integer type")
# Assign the replaced column back instead of calling replace(..., inplace=True)
# on df["children"]: the in-place form acts on a temporary object, triggers a
# chained-assignment FutureWarning in recent pandas and leaves df unchanged
# once Copy-on-Write is enabled, after which to_numeric would fail on "more".
df["children"] = pd.to_numeric(df["children"].replace(to_replace={"more": "4"}))
# Column-wise selection: keep 1, 2 and 3 as they are, replace everything else
# (the former "more" rows) with a random draw. This avoids a slow row-by-row
# DataFrame.apply.
df["children"] = np.where(
    df["children"].isin([1, 2, 3]),
    df["children"],
    np.random.randint(4, 10, size=len(df)),
)
# Examples of producing tabular output
# We rely on the industry-standard package **pandas** for tabulating data.
# The next few examples show:
# - first, how a researcher would normally make the call in pandas,
#   saving the result in a variable that can be viewed on screen (or saved to file);
# - then how the call is identical in ACRO, except that:
#   - "pd" is replaced by "acro"
#   - the researcher immediately sees the TRE output-checking recommendations.
print(
"\nThe first set of examples show acro wrappers around "
" standard tabulation routines from the pandas package."
)

# Pandas crosstab
# This is an example of crosstab using pandas.
# The first line makes the call; the second line prints the result to the screen.
print("\nCalling crosstab of recommendation by parents using pandas")
table = pd.crosstab(df.recommend, df.parents)
print(table)

# ACRO crosstab
# This is the same crosstab made through ACRO.
# The INFO lines show the researcher what will be reported to the output checkers.
# The table (suppressed as necessary) is then shown via the print command as before.
print("\nNow the same crosstab call using the ACRO interface")
safe_table = acro.crosstab(df.recommend, df.parents)
print("\nand this is the researchers output")
print(safe_table)
# ACRO crosstab with suppression
# This example repeats the crosstab while suppressing the cells that violate
# the disclosure tests.
# Suppression is switched on by setting the suppress attribute of the acro
# object to True before running the crosstab command.
print("\nTurn on the suppression variable")
acro.suppress = True
print("\nNow the same crosstab call using the ACRO interface")
safe_table = acro.crosstab(df.recommend, df.parents)
print("\nand this is the researchers output with suppression")
print(safe_table)
# Switch suppression back off so the remaining examples run without it.
print("\nNow turn off the suppression variable")
acro.suppress = False
# ACRO crosstab with an aggregation function
# - First the mean of `children` is tabulated for each recommend/parents cell.
# - Then the same call is attempted with "max". The script expects ACRO to
#   raise a ValueError for that request, which is caught and printed so the
#   script can carry on.
print("\nIllustration of crosstab using an aggregation function - mean in this case.")
safe_table = acro.crosstab(df.recommend, df.parents, values=df.children, aggfunc="mean")
print("\nand this is the researchers output")
print(safe_table)
print(
    "\nThis is what happens if you try to get max values for a cell."
    "\nSo that this script runs on one go, we've caught the exception "
    "thrown by ACRO."
)
# The try/except suites must be indented; without indentation the script
# fails with an IndentationError before anything runs.
try:
    safe_table = acro.crosstab(
        df.recommend, df.parents, values=df.children, aggfunc="max"
    )
except ValueError as e:
    print("ValueError:")
    print(e)
# ACRO pivot_table
# This is an example of a pivot table made through ACRO.
# - Some researchers may prefer this to using crosstab.
# - Again the call syntax is identical to the pandas "pd.pivot_table".
# - In this case the output is non-disclosive.
# The table reports the mean and standard deviation of `children` for each
# value of `parents`.
print("\nIllustration of using the acro version of pandas pivot table")
table = acro.pivot_table(
df, index=["parents"], values=["children"], aggfunc=["mean", "std"]
)
print("\nand this is the researchers output")
print(table)
# Regression examples using ACRO
# Again there is an industry-standard package in python, this time called **statsmodels**.
# - The examples below illustrate the ACRO wrappers around standard statsmodels functions.
# - Note that statsmodels can be called using an 'R-like' format
#   (using an 'r' suffix on the command names).
# - Most statsmodels functions return a "results object",
#   which has a "summary" function that produces printable/saveable outputs.
print(
    "\nThe next set of examples illustrate acro wrappers "
    "around functions from the statsmodels package"
)
# Start by manipulating the nursery data to get two numeric variables.
# - The 'recommend' column is converted to an integer scale.
# The replaced column is assigned back instead of using
# df["recommend"].replace(..., inplace=True): the in-place form acts on a
# temporary object, triggers a chained-assignment FutureWarning in recent
# pandas and leaves df unchanged once Copy-on-Write is enabled, in which case
# to_numeric would then fail on the original labels.
df["recommend"] = pd.to_numeric(
    df["recommend"].replace(
        to_replace={
            "not_recom": "0",
            "recommend": "1",
            "very_recom": "2",
            "priority": "3",
            "spec_prior": "4",
        },
    )
)
# Two-column frame used by the OLS examples; rows with missing values dropped.
new_df = df[["recommend", "children"]].dropna()
# ACRO OLS
# This is an example of ordinary least squares regression using ACRO.
# - Above, the recommend column was converted from categorical to numeric.
# - Now we perform a linear regression of recommend on children.
# - This version includes a constant (intercept), added with add_constant.
# - This is just to show how the regression is done using ACRO.
# - **No correlation is expected to be seen by using these variables**
y = new_df["recommend"]
x = new_df["children"]
x = add_constant(x)
print("\nOrdinary Least Squares Regression")
results = acro.ols(y, x)
print("\nand this is the researchers output")
print(results.summary())

# ACRO OLSR
# This is the same ordinary least squares regression using the 'R-like'
# statsmodels api, i.e. from a formula and a dataframe, through ACRO.
print("\nAnd same, but passing a formula instead of two arrays")
results = acro.olsr(formula="recommend ~ children", data=new_df)
print("\nand this is the researchers output")
print(results.summary())
# ACRO Probit
# This is an example of probit regression using ACRO.
# We use a different combination of variables from the original dataset:
# `finance` as the dependent variable and `children` as the regressor.
new_df = df[["finance", "children"]]
new_df = new_df.dropna()
y = new_df["finance"].astype("category").cat.codes  # integer category codes
# Name the code series "finance" after the column it was derived from.
y.name = "finance"
x = new_df["children"]
x = add_constant(x)
print("\n Example of a probit regression")
results = acro.probit(y, x)
print("\nand this is the researchers output")
print(results.summary())

# ACRO Logit
# This is an example of logistic regression using ACRO's wrapper around the
# statsmodels function; it reuses the same y and x as the probit example.
print("\n Example of a logit regression")
results = acro.logit(y, x)
print("\nand this is the researchers output")
print(results.summary())
# ACRO functionality to let users manage their outputs
#
# 1: List current ACRO outputs
# This is an example of using the print_outputs function to list all the
# outputs created so far.
print("\nNow illustrating how users can manage their outputs")
print(
"\nStart by listing the outputs in the acro memory."
"For each output the key line is the one starting 'Summary'"
)
acro.print_outputs()

# 2: Remove some ACRO outputs before finalising
# This is an example of deleting some of the ACRO outputs.
# The name of the output to be removed is passed to the function remove_output.
# - The output name can be taken from the outputs listed by print_outputs,
# - or by listing the results and choosing the specific output to remove.
# NOTE(review): the original note says output names contain a timestamp, but
# the calls below use "output_<n>" style names - TODO confirm which is current.
# NOTE(review): these names presumably follow the order in which the outputs
# were created earlier in this script - verify that output_1 and output_4 are
# the outputs intended for removal.
print("\nNow removing two disclosive outputs")
acro.remove_output("output_1")
acro.remove_output("output_4")

# 3: Rename ACRO outputs before finalising
# This is an example of renaming an output to give it a more descriptive name.
# According to the original note, the timestamp associated with the output
# name is not overwritten.
# NOTE(review): three crosstab outputs appear to be created before the pivot
# table in this script, so "output_2" may not be the pivot table - TODO confirm.
print("\nUsers can rename output files to something more informative")
acro.rename_output("output_2", "pivot_table")

# 4: Add a comment to an output
# This is an example of adding comments to an output.
# Comments can be used to provide a description
# or to pass additional information to the output checkers.
print("\nUsers can add comments which the output checkers will see.")
acro.add_comments("output_0", "Please let me have this table!")
acro.add_comments("output_0", "6 cells were suppressed in this table")

# 5: Add an unsupported output to the list of outputs
# This is an example of adding an unsupported output (such as an image)
# to the list of outputs, together with a description for the checkers.
print("\nUsers can add files produced by an analysis aCRO doesn't cover")
acro.custom_output(
"XandY.jpeg", "This output is an image showing the relationship between X and Y"
)
# 6 (the big one): Finalise ACRO
# This is an example of the function finalise(),
# which users must call at the end of each session.
# - It takes each output and saves it to a CSV file.
# - It also saves the SDC analysis for each output to a json file or Excel file.
# Here the outputs are written under the name "RES_TEST" in "json" format.
# NOTE(review): the original note says the format follows the file-name
# extension, whereas this call passes the format as a second argument -
# TODO confirm against the finalise signature.
print(
"\nUsers MUST call finalise to send their outputs to the checkers"
" If they don't, the SDC analysis, and their outputs, are lost."
)
output = acro.finalise("RES_TEST", "json")