-
Notifications
You must be signed in to change notification settings - Fork 2
/
DataSynthetiqueGenerator.py
139 lines (109 loc) · 3.77 KB
/
DataSynthetiqueGenerator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# Initialisation: imports and generator settings.
import pandas as pd
import random
# Size of the synthetic dataset.
na = 6     # number of attributes (properties)
no = 1000  # number of objects
ns = 10    # number of sources

# Labels are numbered from 1, e.g. Property1 .. Property<na>.
attributs = [f'Property{index}' for index in range(1, na + 1)]
objects = [f'Object{index}' for index in range(1, no + 1)]
sources = [f'Source{index}' for index in range(1, ns + 1)]

# Generation parameters; each value is between 0 and 1 (see the description
# in the paper).
m1 = 1.0  # lower bound passed to random.uniform(m1, 1) further down
m2 = 0.0  # upper bound passed to random.uniform(0, m2) further down
m3 = 0.8  # lower bound passed to random.uniform(m3, 1) further down

# Folder in which the generated data is saved.  It is prepended as-is to the
# output file names, so a non-empty value must end with a path separator.
floder = ""
# Definition of the true values (ground truth, GT).
def get_truth(objects, attributs, *, truth_range=(100000, 499999),
              false_range=(500000, 999999), n_false=20):
    """Build the ground-truth table for every (object, property) pair.

    For each pair one true value is drawn from ``truth_range`` and
    ``n_false`` distinct false values are drawn from ``false_range``.

    Args:
        objects: Iterable of object labels.
        attributs: Iterable of property labels.  It is iterated once per
            object, so it must be re-iterable (e.g. a list).
        truth_range: ``(start, stop)`` bounds of the true values, half-open
            like ``range``.
        false_range: ``(start, stop)`` bounds of the false values, half-open
            like ``range``.
        n_false: Number of distinct false values stored per pair.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Object``, ``Property``,
        ``Value`` (the true value) and ``False`` (a list of false values),
        one row per pair, objects varying slowest.

    Raises:
        ValueError: If the two ranges overlap, or (from the ``random``
            module) if a range is too small for the requested draw.
    """
    # A false value must never coincide with the true one, so the two value
    # pools have to be disjoint.
    if max(truth_range[0], false_range[0]) < min(truth_range[1], false_range[1]):
        raise ValueError('truth_range and false_range must not overlap')

    false_pool = range(*false_range)
    data = {
        'Object': [],
        'Property': [],
        'Value': [],
        'False': [],
    }
    for o in objects:
        for a in attributs:
            data['Object'].append(o)
            data['Property'].append(a)
            data['Value'].append(random.randrange(*truth_range))
            data['False'].append(random.sample(false_pool, k=n_false))
    return pd.DataFrame(data)
# Ground truth for every (object, property) pair of the dataset.
data_truth = get_truth(objects=objects, attributs=attributs)
# Random selection of a partition of P.
def partition(list_in, n):
    """Split the items of ``list_in`` at random into ``n`` subsets.

    The items are shuffled and then dealt round-robin, so subset sizes
    differ by at most one.  The shuffle is done on a copy: the caller's
    sequence is left untouched.

    Args:
        list_in: Iterable of items to split.
        n: Number of subsets to produce.

    Returns:
        A list of ``n`` lists that together contain every item exactly once.
        Some lists are empty when ``n`` exceeds the number of items, and the
        result is ``[]`` when ``n`` is not positive.
    """
    shuffled = list(list_in)
    random.shuffle(shuffled)
    return [shuffled[i::n] for i in range(n)]
# Split the properties into a random number n of subsets (the partition P),
# with 2 <= n < len(attributs).
if len(attributs) < 3:
    raise ValueError('at least 3 properties are required to draw a partition')
n = random.randrange(2, len(attributs))
# Stored under its own name so that the partition() function stays callable.
subsets = partition(attributs, n)

# Deal the sources round-robin into one group per subset: group i holds
# sources[i], sources[i + n], sources[i + 2n], ...  Group i is the one that
# is accurate on subset i.
if len(sources) < n:
    raise ValueError('at least as many sources as subsets are required')
sources_ = {i: sources[i::n] for i in range(n)}

# Index the ground truth once by (object, property) so that each cell below
# is a dictionary lookup instead of a scan of the whole table.
truth_lookup = {}
for obj, prop, true_value, false_values in zip(
        data_truth['Object'], data_truth['Property'],
        data_truth['Value'], data_truth['False']):
    # Only values that differ from the truth may be emitted as wrong answers.
    wrong_values = [v for v in false_values if v != true_value]
    # setdefault keeps the first row if a key appears more than once.
    truth_lookup.setdefault((obj, prop), (true_value, wrong_values))

# For every subset x1 of P and every group of sources, draw the accuracy of
# the group on x1 and emit one claim per (source, object, property).
records = {
    'ID': [],
    'Object': [],
    'Property': [],
    'Value': [],
    'Source': [],
}
# NOTE(review): ID is never incremented, so every claim is written with
# ID 0 -- confirm whether a unique per-row identifier was intended.
ID = 0
for subset_index, x1 in enumerate(subsets):
    for group_index, group in enumerate(sources_.values()):
        # The group matching this subset draws its accuracy from [m1, 1];
        # every other group draws it from [0, m2].
        if group_index == subset_index:
            base_accuracy = random.uniform(m1, 1)
        else:
            base_accuracy = random.uniform(0, m2)

        for position, source in enumerate(group):
            if position == 0:
                # The first source of a group uses the group accuracy as is.
                n_correct = len(x1) * len(objects) * base_accuracy
            else:
                # The other sources are up to 0.5 less accurate and are
                # scaled down further by a factor drawn from [m3, 1].  A
                # negative result simply means "no correct value".
                accuracy = random.uniform(base_accuracy - 0.5, base_accuracy)
                n_correct = (random.uniform(m3, 1) * len(x1) * len(objects)
                             * accuracy)

            # The first n_correct cells (object-major order) receive the
            # true value; all remaining cells receive a wrong one.
            emitted = 0
            for o in objects:
                for a in x1:
                    true_value, wrong_values = truth_lookup[(o, a)]
                    if emitted < n_correct:
                        value = true_value
                    else:
                        # Raises IndexError if no wrong value is available.
                        value = random.choice(wrong_values)
                    records['ID'].append(ID)
                    records['Object'].append(o)
                    records['Property'].append(a)
                    records['Value'].append(int(value))
                    records['Source'].append(source)
                    emitted += 1

# Build the frame once from the accumulated columns, then write both files.
dataframe = pd.DataFrame(records)
dataframe['Value'] = dataframe.Value.astype('int')
dataframe.to_csv(floder + 'data.csv', index=False)
data_truth.to_csv(floder + 'data_truth.csv', index=False)