-
Notifications
You must be signed in to change notification settings - Fork 0
/
diagnostics.py
127 lines (87 loc) · 3.57 KB
/
diagnostics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
'''
This file performs diagnostics on the data, models, and dependencies used in the project.
Author: Gian Atmaja
Date Created: 19 May 2023
'''
# Import required libraries
import pandas as pd
import numpy as np
import timeit
import os
import json
import pickle
import subprocess
import sys
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset, DataQualityPreset
# Load config.json at import time and resolve the folder paths that every
# diagnostic function below reads as a module-level constant.
with open('config.json','r') as f:
    config = json.load(f)
# NOTE(review): os.path.join with a single argument is a no-op — presumably
# kept for symmetry with the project's other scripts; confirm before removing.
dataset_csv_path = os.path.join(config['output_folder_path'])
test_data_path = os.path.join(config['test_data_path'])
prod_deployment_path = os.path.join(config['prod_deployment_path'])
# Function to get model predictions
def model_predictions(df):
    """Score a dataset with the deployed production model.

    Args:
        df (pd.DataFrame): data containing at least the three feature
            columns 'lastmonth_activity', 'lastyear_activity' and
            'number_of_employees'.

    Returns:
        Predicted labels for each row of df, as returned by the model's
        predict() method.
    """
    # os.path.join is portable across OSes, unlike manual '/' concatenation.
    model_path = os.path.join(prod_deployment_path, "trainedmodel.pkl")
    with open(model_path, 'rb') as pickle_file:
        model = pickle.load(pickle_file)
    # Select only the feature columns the model was trained on.
    X = df[['lastmonth_activity', 'lastyear_activity', 'number_of_employees']]
    # (The original also loaded df['exited'] into an unused local; removed.)
    y_preds = model.predict(X)
    return y_preds
# Function to get summary statistics
def dataframe_summary():
    """Compute mean, median and standard deviation for each numeric
    activity column of the ingested final dataset.

    Returns:
        list: [label, value] pairs, three per column (mean, median, std).
    """
    df = pd.read_csv("{}/finaldata.csv".format(dataset_csv_path))
    numeric_cols = ['lastmonth_activity', 'lastyear_activity',
                    'number_of_employees']
    frame = df[numeric_cols]
    stats = []
    for name in frame.columns:
        series = frame[name]
        stats.extend([
            [name + " (mean):", series.mean()],
            [name + " (median):", series.median()],
            [name + " (standard deviation):", series.std()],
        ])
    return stats
# Function to obtain missing data proportion
def missing_data():
    """Report the percentage of missing (NaN) values per column of the
    ingested final dataset.

    Returns:
        list: [label, pct] pairs, one per column, with pct an int
        percentage (truncated toward zero).
    """
    df = pd.read_csv("{}/finaldata.csv".format(dataset_csv_path))
    proportions = []
    for name in df.columns:
        series = df[name]
        # Truncate to a whole-number percentage, matching the report format.
        pct = int(series.isna().sum() / series.shape[0] * 100)
        proportions.append([name + " (%):", pct])
    return proportions
# Function to get timings of data ingestion and model training scripts
def execution_time(scripts_to_time=None):
    """Measure the wall-clock run time of the pipeline scripts.

    Args:
        scripts_to_time (list[str] | None): script paths to execute and
            time. Defaults to ['ingestion.py', 'training.py'], preserving
            the original behavior.

    Returns:
        list: [label, seconds] pairs, one per script.
    """
    if scripts_to_time is None:
        scripts_to_time = ['ingestion.py', 'training.py']
    time_records = []
    for script in scripts_to_time:
        starttime = timeit.default_timer()
        # sys.executable guarantees the scripts run under the same
        # interpreter as this module, rather than whatever 'python'
        # happens to resolve to on PATH (which may not even exist).
        _ = subprocess.run([sys.executable, script], capture_output=True)
        timing = timeit.default_timer() - starttime
        time_records.append([script + ": ", timing])
    return time_records
# Function to check dependencies
def outdated_packages_list():
    """Return pip's report of installed packages with newer versions
    available.

    Returns:
        str: raw text output of 'pip list --outdated'.
    """
    # Invoke pip via the running interpreter so the check targets the
    # environment this script actually executes in, not whichever 'pip'
    # is first on PATH (they can differ across virtualenvs).
    raw = subprocess.check_output(
        [sys.executable, '-m', 'pip', 'list', '--outdated'])
    # sys.stdout.encoding is None when stdout is redirected; fall back
    # to UTF-8 rather than crashing in decode().
    outdated_packages = raw.decode(sys.stdout.encoding or 'utf-8')
    return outdated_packages
# Function to check for data drift
def get_data_drift_report(df_ref, df_new):
    """Build an EvidentlyAI report comparing current data against a
    reference dataset using the data-drift and target-drift presets.

    Args:
        df_ref (pd.DataFrame): reference (baseline) data.
        df_new (pd.DataFrame): current data to compare.

    Returns:
        Report: the executed Evidently report object.
    """
    presets = [DataDriftPreset(), TargetDriftPreset()]
    report = Report(metrics=presets)
    report.run(reference_data=df_ref, current_data=df_new)
    return report
if __name__ == '__main__':
    # Exercise every diagnostic against the test dataset and print results.
    test_df = pd.read_csv('{}/testdata.csv'.format(test_data_path))
    for diagnostic in (model_predictions(test_df),
                       dataframe_summary(),
                       missing_data(),
                       execution_time()):
        print(diagnostic, '\n')
    print(outdated_packages_list())