-
Notifications
You must be signed in to change notification settings - Fork 0
/
statistics.py
97 lines (71 loc) · 2.74 KB
/
statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
import psycopg2
import argparse
import json
import ast
from parser import parser
def create_statistics(sql_dir):
""" Connect to the PostgreSQL database server """
conn = None
try:
# connect to the PostgreSQL server
print('Connecting to the PostgreSQL database...')
conn = psycopg2.connect(database = "imdbload")
# create a cursor
cur = conn.cursor()
sql_files = os.listdir(sql_dir)
# read all queries and create a set of all attributes used in join and selection predicates
all_attributes = set()
for sql_file in sql_files:
with open(os.path.join(sql_dir, sql_file), 'r') as f:
sql = f.read()
node = parser.parse(sql)
#join_predicates = [p.to_sql() for p in ast.get_joins(node)]
#for join in join_predicates:
# for attribute in join.split('='):
# all_attributes.add(attribute)
selection_predicates= [p.left.to_sql() for p in ast.get_selections(node)]
for predicate in selection_predicates:
all_attributes.add(predicate);
print(len(all_attributes))
all_attributes = sorted(all_attributes)
buckets = 10
n = 3
# for each predicate attribute create the equi depth histogram by invoking the database
statistics = {name: {} for name in all_attributes}
for attribute in all_attributes:
attributeTuple = attribute.split('.')
relation = attributeTuple[0]
attr = attributeTuple[1]
# return the statistics on _relation_._attr_
sqlMCVTemplate = 'ALTER TABLE _relation_ ALTER _attr_ SET STATISTICS _buckets_;\
ANALYZE _relation_;\
SELECT most_common_vals, histogram_bounds FROM pg_stats\
WHERE tablename=\'_relation_\' AND attname=\'_attr_\';';
sql = sqlMCVTemplate.replace("_attr_", attr).replace("_relation_", relation).replace("_buckets_", str(buckets))
# create the mcv
cur.execute(sql)
# store the mcv
for value in cur:
if(value[1] != None):
statistics[attribute]['histogram_bounds'] = value[1].replace('{','').replace('}','').split(',');
else:
statistics[attribute]['histogram_bounds'] = [value[1]]
if(value[0] != None):
statistics[attribute]['most_common_values'] = value[0].replace('{','').replace('}','').split(',')[0:n];
else:
statistics[attribute]['most_common_values'] = [value[0]]
# close the communication with the PostgreSQL
cur.close()
f = open('statistics.json', 'w', encoding='utf-8')
f.write(json.dumps(statistics, ensure_ascii=False, indent=4, sort_keys=True))
except (Exception, psycopg2.DatabaseError) as error:
print(error)
finally:
if conn is not None:
conn.close()
print('Database connection closed.')
argument_parser = argparse.ArgumentParser()
argument_parser.add_argument('sql_dir')
args = argument_parser.parse_args()
create_statistics(args.sql_dir)