-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSample.py
117 lines (103 loc) · 3.57 KB
/
Sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from __future__ import print_function, division
# __author__ = 'WeiFu'
import sys, pdb, random, math
from utility import *
from experiment import *
import matplotlib.pyplot as plt
import numpy as np
def space(p, columns, N, num_of_bins):
    """Cut every column into consecutive chunks and return one weight per chunk.

    Args:
        p: scalar multiplied into every returned weight.
        columns: non-empty list of equally sliceable sequences of pairs; only
            the second item of each pair is read (presumably the class
            label -- confirm against the caller). The length of the first
            column decides the cut points used for all columns.
        N: requested number of chunks per column; fewer are produced when the
            column is too short to fill them.
        num_of_bins: divisor applied to every returned weight.

    Returns:
        A flat list (column by column, chunk by chunk) of
        ``p * 1 / num_of_bins * share``, where ``share`` is the chunk's score
        divided by the sum of all scores and a chunk's score is one minus the
        mean of the second pair items in that chunk.
    """
    def ps(myspace):
        # One score per chunk: 1 - mean(second item of each pair).
        scores = [1 - sum([pair[1] for pair in chunk]) / len(chunk)
                  for column_chunks in myspace
                  for chunk in column_chunks]
        total = sum(scores)
        shares = [score / total for score in scores]
        return [p * 1 / num_of_bins * share for share in shares]

    length = len(columns[0])
    width = int(math.ceil(length / N))
    # Upper bounds of the chunks; the final bound is always the column length.
    uppers = [(k + 1) * width for k in range(N) if (k + 1) * width < length]
    uppers.append(length)
    lowers = [0] + uppers[:-1]

    partitioned = [[col[lo:hi] for lo, hi in zip(lowers, uppers)]
                   for col in columns]
    return ps(partitioned)
def test(w):
    """Tabulate ``1 - (1 - w) ** n`` for n = 10, 20, ... until it reaches 0.90.

    Args:
        w: number plugged into the formula above (presumably a per-sample
            hit probability -- confirm against the caller).

    Returns:
        dict mapping ``str(n)`` to a one-element list holding the value for
        that ``n`` rounded to three decimals. The last entry is the first one
        whose unrounded value is at least 0.90.

    Raises:
        ValueError: if ``abs(1 - w) >= 1``. In that case ``(1 - w) ** n``
            never shrinks, the 0.90 threshold is unreachable and the loop
            would never end.
    """
    if abs(1.0 - w) >= 1.0:
        raise ValueError(
            "w={!r} can never reach 0.90: need abs(1 - w) < 1".format(w))
    points = {}
    n = found = 0
    while found < 0.90:
        n += 10
        found = 1 - (1 - w) ** n
        # n only grows, so every key is new and holds exactly one value.
        points[str(n)] = [round(found, 3)]
    return points


# The name starts with "test", so pytest would otherwise try to collect this
# helper as a test case and fail on the unknown "w" fixture.
test.__test__ = False
def chops(match, source_src, selected_attr=None, N=3):
    """Load a Weka data set and return the ``test`` table for its selected attributes.

    Args:
        match: not used by this function; kept for existing callers.
        source_src: location handed to ``loadWekaData``.
        selected_attr: names of the attributes to keep; ``None`` (the default)
            means no attribute is selected.
        N: number of chunks per attribute passed on to ``space``.

    Returns:
        Whatever ``test(w)`` returns, where ``w`` is the sum of the weights
        produced by ``space`` for the selected attributes.
    """
    if selected_attr is None:
        selected_attr = []
    arff = loadWekaData(source_src)

    def declared_name(attribute):
        # Slice the text between "@attribute " and " numeric" out of the
        # attribute's string form.
        text = str(attribute)
        start = text.find("@attribute") + len("@attribute") + 1
        return text[start:text.find("numeric") - 1]

    # NOTE(review): the original comment says this enumeration excludes the
    # label -- confirm against the Weka API in use.
    attributes = [declared_name(attribute)
                  for attribute in enumerateToList(arff.enumerateAttributes())]
    attributes_index = [i for i, attr in enumerate(attributes)
                        if attr in selected_attr]

    # The class column is the same for every attribute, so fetch it once.
    labels = arff.attributeToDoubleArray(arff.classIndex())
    # Each column: (value, label) pairs ordered from largest to smallest.
    columns = [sorted(zip(arff.attributeToDoubleArray(i), labels), reverse=True)
               for i in attributes_index]

    num_of_bins = N ** len(attributes_index)
    p = numBuggyInstance(arff) / arff.size()
    w = sum(space(p, columns, N, num_of_bins))
    return test(w)
def run():
    """Print the ``chops`` table of every source matched to each target data set.

    Reads the match list from ``./result/Sim3_source_target_match0727.txt``
    and the data-set listing from ``./dataset``. For each target file it
    prints a header line, then one ``printout`` table per match whose
    ``target_name`` equals that file's base name.
    """
    source_target_match = readMatch("./result/Sim3_source_target_match0727.txt")
    datasrc = readsrc("./dataset")
    # .items() keeps this loop working on both Python 2 and Python 3.
    for _group, srclst in datasrc.items():
        for target_src in srclst:
            target_name = target_src[target_src.rindex("/") + 1:]
            print("target:", target_name, "*" * 10)
            for i in source_target_match:
                if i.target_name != target_name:
                    continue
                source_name = i.source_src[i.source_src.rindex("/") + 1:]
                result = {"method": ["N", source_name]}
                for key, val in chops([], i.source_src, i.attr_source).items():
                    # Each row starts with its own key, followed by the values.
                    result[key] = result.get(key, [key]) + val
                printout(result)
def plot(result):
    """Show a single line chart of ``result[1]`` against ``result[0]``.

    Args:
        result: indexable whose item 0 holds the x values and item 1 the
            y values.

    The y axis is fixed to [0, 1] and the call blocks in ``plt.show()``.
    """
    # Only the first style/label pair is drawn.
    line_styles = ['r-', 'k-', 'b-', 'g-', 'y-', 'c-', 'm-']
    series_names = ['Prob', 'manualUp', 'manualDown', 'minimum', 'best', 'CART', 'C4.5']
    xs = result[0]
    ys = result[1]

    plt.figure(1)
    plt.plot(xs, ys, line_styles[0], label=series_names[0])
    plt.xlabel("N(sample size)")
    plt.ylabel("Prob")
    plt.ylim(0, 1)
    plt.legend(loc='best')
    plt.show()
def distribution():
    """Print how many matches exist for each number of source attributes.

    Reads the match list from ``./result/Large_Small_match.txt`` and prints
    one ``dim=<count of attributes> : <number of matches>`` line per distinct
    ``len(match.attr_source)``.
    """
    source_target_match = readMatch("./result/Large_Small_match.txt")
    # The listing is not used below; the call is kept in case readsrc has
    # side effects -- TODO confirm and drop if it has none.
    readsrc("./dataset")

    result = {}
    for i in source_target_match:
        dim = len(i.attr_source)
        result[dim] = result.get(dim, 0) + 1
    # .items() keeps this loop working on both Python 2 and Python 3.
    for key, val in result.items():
        print("dim=" + str(key), ":", str(val))
def _points_until_90(w):
    """Return ``{str(n): [round(1 - (1 - w) ** n, 3)]}`` for n = 10, 20, ... up to the first value >= 0.90."""
    if abs(1.0 - w) >= 1.0:
        # (1 - w) ** n never shrinks, so the loop below could not end.
        raise ValueError(
            "w={!r} can never reach 0.90: need abs(1 - w) < 1".format(w))
    points = {}
    n = found = 0
    while found < 0.90:
        n += 10
        found = 1 - (1 - w) ** n
        points[str(n)] = [round(found, 3)]
    return points


def test(w=None):
    """Tabulate the curve for ``w``, or plot the built-in demo when ``w`` is omitted.

    This definition rebinds the module-level name ``test``, while ``chops``
    still calls ``test(w)``. Accepting an optional ``w`` keeps that call
    working instead of raising ``TypeError``.

    Args:
        w: when given, the value used in ``1 - (1 - w) ** n``. When omitted,
            the demo runs with ``w = 0.003``.

    Returns:
        With ``w``: dict mapping ``str(n)`` (n = 10, 20, ...) to a one-element
        list with the rounded value, ending at the first value >= 0.90.
        Without ``w``: ``None``; the curve for n = 0, 10, ..., 990 is plotted
        and the list of values is printed.

    Raises:
        ValueError: if ``w`` is given and ``abs(1 - w) >= 1``.
    """
    if w is not None:
        return _points_until_90(w)

    w = 0.003
    index = list(range(0, 1000, 10))
    out = [1 - (1 - w) ** i for i in index]
    plot([np.array(index), np.array(out)])
    print(out)


# The name starts with "test", so pytest would otherwise collect this helper
# as a test case and run the plotting demo.
test.__test__ = False
if __name__ == "__main__":
    # Script entry point: report the match distribution. run() and test()
    # are the other drivers that can be called here instead.
    distribution()