-
Notifications
You must be signed in to change notification settings - Fork 0
/
benford.py
116 lines (96 loc) · 4.15 KB
/
benford.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""Check conformance of numerical data to Benford's Law."""
import sys
import math
from collections import defaultdict
import matplotlib.pyplot as plt
# Benford's Law percentages for leading digits 1-9
BENFORD = [30.1, 17.6, 12.5, 9.7, 7.9, 6.7, 5.8, 5.1, 4.6]
def load_data(filename):
"""Open a text file & return a list of strings."""
with open(filename) as f:
return f.read().strip().split('\n')
def count_first_digits(data_list):
"""Count 1st digits in list of numbers; return counts & frequency."""
first_digits = defaultdict(int) # default value of int is 0
for sample in data_list:
if sample == '':
continue
try:
int(sample)
except ValueError as e:
print(e, file=sys.stderr)
print("Samples must be integers. Exiting.", file=sys.stderr)
sys.exit(1)
first_digits[sample[0]] += 1
# check for missing digits
keys = [str(digit) for digit in range(1, 10)]
for key in keys:
if key not in first_digits:
first_digits[key] = 0
data_count = [v for (k, v) in sorted(first_digits.items())]
total_count = sum(data_count)
data_pct = [(i / total_count) * 100 for i in data_count]
return data_count, data_pct, total_count
def get_expected_counts(total_count):
"""Return list of expected Benford's Law counts for total sample count."""
return [round(p * total_count / 100) for p in BENFORD]
def chi_square_test(data_count, expected_counts):
"""Return boolean on chi-square test (8 degrees of freedom & P-val=0.05)."""
chi_square_stat = 0 # chi square test statistic
for data, expected in zip(data_count, expected_counts):
chi_square = math.pow(data - expected, 2)
chi_square_stat += chi_square / expected
print("\nChi-squared Test Statistic = {:.3f}".format(chi_square_stat))
print("Critical value at a P-value of 0.05 is 15.51.")
return chi_square_stat < 15.51
def bar_chart(data_pct):
"""Make bar chart of observed vs expected 1st digit frequency in percent."""
fig, ax = plt.subplots()
index = [i + 1 for i in range(len(data_pct))] # 1st digits for x-axis
# text for labels, title and ticks
fig.canvas.set_window_title('Percentage First Digits')
ax.set_title('Data vs. Benford Values', fontsize=15)
ax.set_ylabel('Frequency (%)', fontsize=16)
ax.set_xticks(index)
ax.set_xticklabels(index, fontsize=14)
# build bars
rects = ax.bar(index, data_pct, width=0.95, color='black', label='Data')
# attach a text label above each bar displaying its height
for rect in rects:
height = rect.get_height()
ax.text(rect.get_x() + rect.get_width()/2, height,
'{:0.1f}'.format(height), ha='center', va='bottom',
fontsize=13)
# plot Benford values as red dots
ax.scatter(index, BENFORD, s=150, c='red', zorder=2, label='Benford')
# Hide the right and top spines & add legend
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.legend(prop={'size':15}, frameon=False)
plt.show()
def main():
"""Call functions and print stats."""
# load data
while True:
filename = input("\nName of file with COUNT data: ")
try:
data_list = load_data(filename)
except IOError as e:
print("{}. Try again.".format(e), file=sys.stderr)
else:
break
data_count, data_pct, total_count = count_first_digits(data_list)
expected_counts = get_expected_counts(total_count)
print("\nobserved counts = {}".format(data_count))
print("expected counts = {}".format(expected_counts), "\n")
print("First Digit Probabilities:")
for i in range(1, 10):
print("{}: observed: {:.3f} expected: {:.3f}".
format(i, data_pct[i - 1] / 100, BENFORD[i - 1] / 100))
if chi_square_test(data_count, expected_counts):
print("Observed distribution matches expected distribution.")
else:
print("Observed distribution does not match expected.", file=sys.stderr)
bar_chart(data_pct)
if __name__ == '__main__':
main()