Skip to content

Commit f1b1e37

Browse files
authored
Merge pull request #114 from AI-SDC/113_modify_json_schema
modify output JSON to be in a file-oriented schema
2 parents 7d7c561 + aae3f9d commit f1b1e37

File tree

10 files changed: +936 additions, -807 deletions (lines changed)

.pre-commit-config.yaml

+1
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,7 @@ repos:
8080
- --exclude=__init__.py
8181
- --ignore=E203,W503
8282
additional_dependencies: [flake8-bugbear, pep8-naming]
83+
exclude: "docs"
8384

8485
# Check types with mypy
8586
- repo: https://github.com/pre-commit/mirrors-mypy

CHANGELOG.md

+1
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ Changes:
77
* Refactor load_json Records class function to static load_results ([#110](https://github.com/AI-SDC/ACRO/pull/110))
88
* Write SDC parameters to config.json upon finalise ([#111](https://github.com/AI-SDC/ACRO/pull/111))
99
* Add explicit exception handling and finalise prompt ([#112](https://github.com/AI-SDC/ACRO/pull/112))
10+
* Add version number to JSON and use new schema ([#114](https://github.com/AI-SDC/ACRO/pull/114))
1011

1112
## Version 0.3.0 (Jul 04, 2023)
1213

acro/acro.py

+16 -10
Original file line number | Diff line number | Diff line change
@@ -271,11 +271,10 @@ def crosstab( # pylint: disable=too-many-arguments,too-many-locals
271271
mask.replace({0: False, 1: True}, inplace=True)
272272
masks[name] = mask
273273

274-
# build the properties dictionary
275-
properties: dict = {"method": "crosstab", "suppressed": self.suppress}
276-
utils.update_table_properties(masks, properties)
274+
# build the sdc dictionary
275+
sdc: dict = utils.get_table_sdc(masks, self.suppress)
277276
# get the status and summary
278-
status, summary = utils.get_summary(properties)
277+
status, summary = utils.get_summary(sdc)
279278
# apply the suppression
280279
safe_table, outcome = utils.apply_suppression(table, masks)
281280
if self.suppress:
@@ -284,7 +283,8 @@ def crosstab( # pylint: disable=too-many-arguments,too-many-locals
284283
self.results.add(
285284
status=status,
286285
output_type="table",
287-
properties=properties,
286+
properties={"method": "crosstab"},
287+
sdc=sdc,
288288
command=command,
289289
summary=summary,
290290
outcome=outcome,
@@ -410,11 +410,10 @@ def pivot_table( # pylint: disable=too-many-arguments,too-many-locals
410410
data, values, index, columns, aggfunc=agg
411411
)
412412

413-
# build the properties dictionary
414-
properties: dict = {"method": "pivot_table", "suppressed": self.suppress}
415-
utils.update_table_properties(masks, properties)
413+
# build the sdc dictionary
414+
sdc: dict = utils.get_table_sdc(masks, self.suppress)
416415
# get the status and summary
417-
status, summary = utils.get_summary(properties)
416+
status, summary = utils.get_summary(sdc)
418417
# apply the suppression
419418
safe_table, outcome = utils.apply_suppression(table, masks)
420419
if self.suppress:
@@ -423,7 +422,8 @@ def pivot_table( # pylint: disable=too-many-arguments,too-many-locals
423422
self.results.add(
424423
status=status,
425424
output_type="table",
426-
properties=properties,
425+
properties={"method": "pivot_table"},
426+
sdc=sdc,
427427
command=command,
428428
summary=summary,
429429
outcome=outcome,
@@ -503,6 +503,7 @@ def ols( # pylint: disable=too-many-locals
503503
status=status,
504504
output_type="regression",
505505
properties={"method": "ols", "dof": dof},
506+
sdc={},
506507
command=command,
507508
summary=summary,
508509
outcome=DataFrame(),
@@ -566,6 +567,7 @@ def olsr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
566567
status=status,
567568
output_type="regression",
568569
properties={"method": "olsr", "dof": dof},
570+
sdc={},
569571
command=command,
570572
summary=summary,
571573
outcome=DataFrame(),
@@ -614,6 +616,7 @@ def logit( # pylint: disable=too-many-arguments,too-many-locals
614616
status=status,
615617
output_type="regression",
616618
properties={"method": "logit", "dof": dof},
619+
sdc={},
617620
command=command,
618621
summary=summary,
619622
outcome=DataFrame(),
@@ -677,6 +680,7 @@ def logitr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
677680
status=status,
678681
output_type="regression",
679682
properties={"method": "logitr", "dof": dof},
683+
sdc={},
680684
command=command,
681685
summary=summary,
682686
outcome=DataFrame(),
@@ -725,6 +729,7 @@ def probit( # pylint: disable=too-many-arguments,too-many-locals
725729
status=status,
726730
output_type="regression",
727731
properties={"method": "probit", "dof": dof},
732+
sdc={},
728733
command=command,
729734
summary=summary,
730735
outcome=DataFrame(),
@@ -788,6 +793,7 @@ def probitr( # pylint: disable=too-many-locals,keyword-arg-before-vararg
788793
status=status,
789794
output_type="regression",
790795
properties={"method": "probitr", "dof": dof},
796+
sdc={},
791797
command=command,
792798
summary=summary,
793799
outcome=DataFrame(),

acro/record.py

+45 -16
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,8 @@
1212
import pandas as pd
1313
from pandas import DataFrame
1414

15+
from .version import __version__
16+
1517
logger = logging.getLogger("acro:records")
1618

1719

@@ -67,6 +69,8 @@ class Record: # pylint: disable=too-many-instance-attributes,too-few-public-met
6769
Type of output, e.g., "regression"
6870
properties : dict
6971
Dictionary containing structured output data.
72+
sdc : dict
73+
Dictionary containing SDC results.
7074
command : str
7175
String representation of the operation performed.
7276
summary : str
@@ -89,6 +93,7 @@ def __init__( # pylint: disable=too-many-arguments
8993
status: str,
9094
output_type: str,
9195
properties: dict,
96+
sdc: dict,
9297
command: str,
9398
summary: str,
9499
outcome: DataFrame,
@@ -107,6 +112,8 @@ def __init__( # pylint: disable=too-many-arguments
107112
Type of output, e.g., "regression"
108113
properties : dict
109114
Dictionary containing structured output data.
115+
sdc : dict
116+
Dictionary containing SDC results.
110117
command : str
111118
String representation of the operation performed.
112119
summary : str
@@ -122,6 +129,7 @@ def __init__( # pylint: disable=too-many-arguments
122129
self.status: str = status
123130
self.output_type: str = output_type
124131
self.properties: dict = properties
132+
self.sdc: dict = sdc
125133
self.command: str = command
126134
self.summary: str = summary
127135
self.outcome: DataFrame = outcome
@@ -180,6 +188,7 @@ def __str__(self) -> str:
180188
f"status: {self.status}\n"
181189
f"type: {self.output_type}\n"
182190
f"properties: {self.properties}\n"
191+
f"sdc: {self.sdc}\n"
183192
f"command: {self.command}\n"
184193
f"summary: {self.summary}\n"
185194
f"outcome: {self.outcome}\n"
@@ -203,6 +212,7 @@ def add( # pylint: disable=too-many-arguments
203212
status: str,
204213
output_type: str,
205214
properties: dict,
215+
sdc: dict,
206216
command: str,
207217
summary: str,
208218
outcome: DataFrame,
@@ -219,6 +229,8 @@ def add( # pylint: disable=too-many-arguments
219229
Type of output, e.g., "regression"
220230
properties : dict
221231
Dictionary containing structured output data.
232+
sdc : dict
233+
Dictionary containing SDC results.
222234
command : str
223235
String representation of the operation performed.
224236
summary : str
@@ -235,6 +247,7 @@ def add( # pylint: disable=too-many-arguments
235247
status=status,
236248
output_type=output_type,
237249
properties=properties,
250+
sdc=sdc,
238251
command=command,
239252
summary=summary,
240253
outcome=outcome,
@@ -317,6 +330,7 @@ def add_custom(self, filename: str, comment: str | None = None) -> None:
317330
status="review",
318331
output_type="custom",
319332
properties={},
333+
sdc={},
320334
command="custom",
321335
summary="review",
322336
outcome=DataFrame(),
@@ -437,17 +451,22 @@ def finalise_json(self, path: str) -> None:
437451
"status": val.status,
438452
"type": val.output_type,
439453
"properties": val.properties,
454+
"files": [],
455+
"outcome": json.loads(val.outcome.to_json()),
440456
"command": val.command,
441457
"summary": val.summary,
442-
"outcome": json.loads(val.outcome.to_json()),
443-
"output": val.serialize_output(path),
444458
"timestamp": val.timestamp,
445459
"comments": val.comments,
446460
"exception": val.exception,
447461
}
462+
files: list[str] = val.serialize_output(path)
463+
for file in files:
464+
outputs[key]["files"].append({"name": file, "sdc": val.sdc})
465+
466+
results: dict = {"version": __version__, "results": outputs}
448467
filename: str = os.path.normpath(f"{path}/results.json")
449-
with open(filename, "w", newline="", encoding="utf-8") as file:
450-
json.dump(outputs, file, indent=4, sort_keys=False)
468+
with open(filename, "w", newline="", encoding="utf-8") as handle:
469+
json.dump(results, handle, indent=4, sort_keys=False)
451470

452471
def finalise_excel(self, path: str) -> None:
453472
"""Writes outputs to an excel spreadsheet.
@@ -536,19 +555,29 @@ def load_records(path: str) -> Records:
536555
"""
537556
records = Records()
538557
filename = os.path.normpath(f"{path}/results.json")
539-
with open(filename, newline="", encoding="utf-8") as file:
540-
data = json.load(file)
541-
for key, val in data.items():
558+
with open(filename, newline="", encoding="utf-8") as handle:
559+
data = json.load(handle)
560+
if data["version"] != __version__: # pragma: no cover
561+
raise ValueError("error loading output")
562+
for key, val in data["results"].items():
563+
files: list[dict] = val["files"]
564+
filenames: list = []
565+
sdcs: list = []
566+
for file in files:
567+
filenames.append(file["name"])
568+
sdcs.append(file["sdc"])
542569
records.results[key] = Record(
543-
val["uid"],
544-
val["status"],
545-
val["type"],
546-
val["properties"],
547-
val["command"],
548-
val["summary"],
549-
load_outcome(val["outcome"]),
550-
load_output(path, val["output"]),
551-
val["comments"],
570+
uid=val["uid"],
571+
status=val["status"],
572+
output_type=val["type"],
573+
properties=val["properties"],
574+
sdc=sdcs[0],
575+
command=val["command"],
576+
summary=val["summary"],
577+
outcome=load_outcome(val["outcome"]),
578+
output=load_output(path, filenames),
579+
comments=val["comments"],
552580
)
581+
records.results[key].exception = val["exception"]
553582
records.results[key].timestamp = val["timestamp"]
554583
return records

acro/utils.py

+30 -28
Original file line number | Diff line number | Diff line change
@@ -218,44 +218,45 @@ def apply_suppression(
218218
return safe_df, outcome_df
219219

220220

221-
def update_table_properties(masks: dict[str, DataFrame], properties: dict) -> None:
222-
"""Updates the properties dictionary using the suppression masks.
221+
def get_table_sdc(masks: dict[str, DataFrame], suppress: bool) -> dict:
222+
"""Returns the SDC dictionary using the suppression masks.
223223
224224
Parameters
225225
----------
226226
masks : dict[str, DataFrame]
227227
Dictionary of tables specifying suppression masks for application.
228-
properties : dict
229-
Properties of the SDC checks.
228+
suppress : bool
229+
Whether suppression has been applied.
230230
"""
231231
# summary of cells to be suppressed
232-
properties["negative"] = 0
233-
properties["missing"] = 0
234-
properties["threshold"] = 0
235-
properties["p-ratio"] = 0
236-
properties["nk-rule"] = 0
232+
sdc: dict = {"summary": {"suppressed": suppress}, "cells": {}}
233+
sdc["summary"]["negative"] = 0
234+
sdc["summary"]["missing"] = 0
235+
sdc["summary"]["threshold"] = 0
236+
sdc["summary"]["p-ratio"] = 0
237+
sdc["summary"]["nk-rule"] = 0
237238
for name, mask in masks.items():
238-
properties[name] = int(mask.to_numpy().sum())
239+
sdc["summary"][name] = int(mask.to_numpy().sum())
239240
# positions of cells to be suppressed
240-
properties["sdc"] = {}
241-
properties["sdc"]["negative"] = []
242-
properties["sdc"]["missing"] = []
243-
properties["sdc"]["threshold"] = []
244-
properties["sdc"]["p-ratio"] = []
245-
properties["sdc"]["nk-rule"] = []
241+
sdc["cells"]["negative"] = []
242+
sdc["cells"]["missing"] = []
243+
sdc["cells"]["threshold"] = []
244+
sdc["cells"]["p-ratio"] = []
245+
sdc["cells"]["nk-rule"] = []
246246
for name, mask in masks.items():
247247
true_positions = np.column_stack(np.where(mask.values))
248248
for pos in true_positions:
249249
row_index, col_index = pos
250-
properties["sdc"][name].append([int(row_index), int(col_index)])
250+
sdc["cells"][name].append([int(row_index), int(col_index)])
251+
return sdc
251252

252253

253-
def get_summary(properties: dict) -> tuple[str, str]:
254+
def get_summary(sdc: dict) -> tuple[str, str]:
254255
"""Returns the status and summary of the suppression masks.
255256
256257
Parameters
257258
----------
258-
properties : dict
259+
sdc : dict
259260
Properties of the SDC checks.
260261
261262
Returns
@@ -267,22 +268,23 @@ def get_summary(properties: dict) -> tuple[str, str]:
267268
"""
268269
status: str = "pass"
269270
summary: str = ""
270-
sup: str = "suppressed" if properties["suppressed"] else "may need suppressing"
271-
if properties["negative"]:
271+
sdc_summary = sdc["summary"]
272+
sup: str = "suppressed" if sdc_summary["suppressed"] else "may need suppressing"
273+
if sdc_summary["negative"] > 0:
272274
summary += "negative values found"
273275
status = "review"
274-
elif properties["missing"]:
276+
elif sdc_summary["missing"] > 0:
275277
summary += "missing values found"
276278
status = "review"
277279
else:
278-
if properties["threshold"] > 0:
279-
summary += f"threshold: {properties['threshold']} cells {sup}; "
280+
if sdc_summary["threshold"] > 0:
281+
summary += f"threshold: {sdc_summary['threshold']} cells {sup}; "
280282
status = "fail"
281-
if properties["p-ratio"] > 0:
282-
summary += f"p-ratio: {properties['p-ratio']} cells {sup}; "
283+
if sdc_summary["p-ratio"] > 0:
284+
summary += f"p-ratio: {sdc_summary['p-ratio']} cells {sup}; "
283285
status = "fail"
284-
if properties["nk-rule"] > 0:
285-
summary += f"nk-rule: {properties['nk-rule']} cells {sup}; "
286+
if sdc_summary["nk-rule"] > 0:
287+
summary += f"nk-rule: {sdc_summary['nk-rule']} cells {sup}; "
286288
status = "fail"
287289
if summary != "":
288290
summary = f"{status}; {summary}"

acro/version.py

+2
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
"""ACRO version number."""
2+
__version__ = "0.4.0"

docs/source/conf.py

+3 -1
Original file line number | Diff line number | Diff line change
@@ -7,12 +7,14 @@
77

88
sys.path.insert(0, os.path.abspath("../../"))
99

10+
from acro.version import __version__
11+
1012
# -- Project information -----------------------------------------------------
1113

1214
project = "ACRO"
1315
copyright = "2023, ACRO Project Team"
1416
author = "ACRO Project Team"
15-
release = "0.3.0"
17+
release = __version__
1618

1719
# -- General configuration ---------------------------------------------------
1820

0 commit comments

Comments
 (0)