#!/usr/bin/python3
# This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
# NOTE: This script is experimental. This script uses a linear regression to construct a model for predicting native
# code size from bytecode. Some initial work has been done to analyze a large corpus of Luau scripts, and while for
# most functions the model predicts the native code size quite well (+/-25%), there are many cases where the predicted
# size is off by as much as 13x. Notably, the predicted size is generally better for smaller functions and worse for
# larger functions. Therefore, in its current form this analysis is probably not suitable for use as a basis for
# compilation heuristics. A nonlinear model may produce better results. The script here exists as a foundation for
# further exploration.
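#
# Example invocation (a sketch; the stats glob and output file names below are
# hypothetical, not files shipped with this repository):
#   python3 codesizeprediction.py "stats/**/*.json" model.txt \
#       --nativesizefig nativesize.png --predictionerrorfig predictionerror.png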
import json
import glob
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import argparse


def readStats(statsFileGlob):
    '''Reads files matching the supplied glob.
    Files should be generated by the Compile.cpp CLI'''
    statsFiles = glob.glob(statsFileGlob, recursive=True)
    print("Reading %s files." % len(statsFiles))

    df_dict = {
        "statsFile": [],
        "script": [],
        "name": [],
        "line": [],
        "bcodeCount": [],
        "irCount": [],
        "asmCount": [],
        "bytecodeSummary": []
    }

    for statsFile in statsFiles:
        stats = json.loads(Path(statsFile).read_text())
        for script, filestats in stats.items():
            for funstats in filestats["lowerStats"]["functions"]:
                df_dict["statsFile"].append(statsFile)
                df_dict["script"].append(script)
                df_dict["name"].append(funstats["name"])
                df_dict["line"].append(funstats["line"])
                df_dict["bcodeCount"].append(funstats["bcodeCount"])
                df_dict["irCount"].append(funstats["irCount"])
                df_dict["asmCount"].append(funstats["asmCount"])
                df_dict["bytecodeSummary"].append(
                    tuple(funstats["bytecodeSummary"][0]))

    return pd.DataFrame.from_dict(df_dict)
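

# For reference, the stats JSON consumed above is assumed to look roughly like this
# (field names are taken from the reads above; the values are purely illustrative):
# {
#     "path/to/script.lua": {
#         "lowerStats": {
#             "functions": [
#                 {"name": "f", "line": 1, "bcodeCount": 10, "irCount": 40,
#                  "asmCount": 60, "bytecodeSummary": [[0, 2, 1, ...]]}
#             ]
#         }
#     }
# }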


def addFunctionCount(df):
    '''Adds a 'functionCount' column: for each row, the number of distinct
    asmCount values observed across functions sharing the same bytecodeSummary.'''
    df2 = df.drop_duplicates(subset=['asmCount', 'bytecodeSummary'], ignore_index=True).groupby(
        ['bytecodeSummary']).size().reset_index(name='functionCount')
    return df.merge(df2, on='bytecodeSummary', how='left')


# def deduplicateDf(df):
#     return df.drop_duplicates(subset=['bcodeCount', 'asmCount', 'bytecodeSummary'], ignore_index=True)


def randomizeDf(df):
    '''Shuffles the rows of the dataframe.'''
    return df.sample(frac=1)


def splitSeq(seq):
    '''Splits a sequence into two halves.'''
    n = len(seq) // 2
    return (seq[:n], seq[n:])
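

# A note on the model fitted below (a description of the code, not an extra step):
# with fit_intercept=False and positive=True, the regression approximates
#     asmCount ~= sum_i coef_i * bytecodeSummary_i
# so each non-negative coefficient can be read as an estimated native-instruction
# cost per unit of the corresponding bytecode summary entry (the opcode distribution).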
def trainAsmSizePredictor(df):
    '''Fits a linear model predicting asmCount from the bytecode summary,
    using the first half of the rows for training and the second half for
    validation, and annotates the dataframe with the predictions.'''
    XTrain, XValidate = splitSeq(
        np.array([list(seq) for seq in df.bytecodeSummary]))
    YTrain, YValidate = splitSeq(np.array(df.asmCount))

    reg = LinearRegression(
        positive=True, fit_intercept=False).fit(XTrain, YTrain)

    YPredict1 = reg.predict(XTrain)
    YPredict2 = reg.predict(XValidate)

    trainRmse = np.sqrt(np.mean((np.array(YPredict1) - np.array(YTrain))**2))
    predictRmse = np.sqrt(
        np.mean((np.array(YPredict2) - np.array(YValidate))**2))

    print(f"Score: {reg.score(XTrain, YTrain)}")
    print(f"Training RMSE: {trainRmse}")
    print(f"Prediction RMSE: {predictRmse}")
    print(f"Model Intercept: {reg.intercept_}")
    print(f"Model Coefficients:\n{reg.coef_}")

    df.loc[:, 'asmCountPredicted'] = np.concatenate(
        (YPredict1, YPredict2)).round().astype(int)
    df['usedForTraining'] = np.concatenate(
        (np.repeat(True, YPredict1.size), np.repeat(False, YPredict2.size)))
    df['diff'] = df['asmCountPredicted'] - df['asmCount']
    df['diffPerc'] = (100 * df['diff']) / df['asmCount']
    # Functions with asmCount == 0 yield an infinite percentage error; zero those out.
    df.loc[(df["diffPerc"] == np.inf), 'diffPerc'] = 0.0
    df['diffPerc'] = df['diffPerc'].round()

    return (reg, df)


def saveModel(reg, file):
    '''Writes the model intercept and coefficients to a text file.'''
    with open(file, "w") as f:
        f.write(f"Intercept: {reg.intercept_}\n")
        f.write(f"Coefficients: \n{reg.coef_}\n")
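

# A minimal sketch (not used by this script) of how a fitted model could be applied
# to a single function: 'summary' is a hypothetical opcode-count vector of the same
# length as reg.coef_, and the prediction is just the dot product (no intercept):
#   predictedAsmCount = float(np.dot(reg.coef_, summary))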


def bcodeVsAsmPlot(df, plotFile=None, minBcodeCount=None, maxBcodeCount=None):
    '''Scatter plot of native instruction count against bytecode instruction
    count, optionally restricted to a bytecode count range and saved to a file.'''
    if minBcodeCount is None:
        minBcodeCount = df.bcodeCount.min()
    if maxBcodeCount is None:
        maxBcodeCount = df.bcodeCount.max()

    subDf = df[(df.bcodeCount <= maxBcodeCount) &
               (df.bcodeCount >= minBcodeCount)]

    plt.scatter(subDf.bcodeCount, subDf.asmCount)
    plt.title("ASM variation by Bytecode")
    plt.xlabel("Bytecode Instruction Count")
    plt.ylabel("ASM Instruction Count")

    if plotFile is not None:
        plt.savefig(plotFile)

    return plt


def predictionErrorPlot(df, plotFile=None, minPerc=None, maxPerc=None, bins=200):
    '''Histogram of the prediction error (in percent) over the validation rows,
    optionally restricted to an error range and saved to a file.'''
    if minPerc is None:
        minPerc = df['diffPerc'].min()
    if maxPerc is None:
        maxPerc = df['diffPerc'].max()

    plotDf = df[(~df["usedForTraining"]) & (
        df["diffPerc"] >= minPerc) & (df["diffPerc"] <= maxPerc)]

    plt.hist(plotDf["diffPerc"], bins=bins)
    plt.title("Prediction Error Distribution")
    plt.xlabel("Prediction Error %")
    plt.ylabel("Function Count")

    if plotFile is not None:
        plt.savefig(plotFile)

    return plt


def parseArgs():
    parser = argparse.ArgumentParser(
        prog='codesizeprediction.py',
        description='Constructs a linear regression model to predict native instruction count from bytecode opcode distribution')
    parser.add_argument("fileglob",
                        help="glob pattern for stats files to be used for training")
    parser.add_argument("modelfile",
                        help="text file to save model details")
    parser.add_argument("--nativesizefig",
                        help="path for saving the plot showing the variation of native code size with bytecode")
    parser.add_argument("--predictionerrorfig",
                        help="path for saving the plot showing the distribution of prediction error")
    return parser.parse_args()


if __name__ == "__main__":
    args = parseArgs()

    df0 = readStats(args.fileglob)
    df1 = addFunctionCount(df0)
    df2 = randomizeDf(df1)

    plt = bcodeVsAsmPlot(df2, args.nativesizefig, 0, 100)
    plt.show()

    (reg, df4) = trainAsmSizePredictor(df2)
    saveModel(reg, args.modelfile)

    plt = predictionErrorPlot(df4, args.predictionerrorfig, -200, 200)
    plt.show()