-
Notifications
You must be signed in to change notification settings - Fork 6
/
preprocessing.py
141 lines (123 loc) · 4.57 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# Recursive-descent builder: consumes one tree node (plus its whole subtree)
# from the token list and mirrors it into the graph under construction.
def fill():
    """Add the node at ``input_1[location]`` and all its descendants to ``g``.

    The function communicates exclusively through module-level state:

    * ``input_1``     - token list being parsed (read only).
    * ``location``    - cursor into ``input_1``; advanced past everything
      this call consumes, including the node's closing ``'}'``.
    * ``total``       - number of nodes created so far; also the id that the
      next created node receives.
    * ``tensor_list`` - receives one feature row per created node, in node-id
      order.
    * ``edge_list``   - receives one ``(child_id, parent_id)`` pair per
      parent/child relation.
    * ``g``           - graph object; gains one node per call.
    * ``name_to_int`` / ``vector_tensor`` - token -> row index lookup and the
      table the feature rows are taken from.

    Any number of ``'{'``-introduced children is accepted. (Earlier revisions
    handled at most two and left the cursor misplaced on a third.)

    Raises:
        KeyError: if the current token is not a key of ``name_to_int``.
        IndexError: if the token list ends before the node is closed.
    """
    # Only names that are *rebound* need a global declaration; the lists and
    # the graph are mutated in place.
    global location
    global total

    # The token under the cursor is this node's label.
    item = input_1[location]
    # The lookup key is the token wrapped in single quotes.
    index = name_to_int["'" + item + "'"]
    tensor_to_add = vector_tensor.data[index]
    g.add_nodes(1)
    tensor_list.append(tensor_to_add)

    # Remember this node's id before the counter moves on to its children.
    current_node = total
    total += 1
    location += 1

    # Every '{' that follows opens another child subtree of this node.
    item = input_1[location]
    while item == '{':
        location += 1
        # `total` is the id the child is about to receive inside fill().
        edge_list.append((total, current_node))
        fill()
        item = input_1[location]

    # Step over this node's own closing brace, if present.
    if item == '}':
        location += 1
    return
data = pd.read_json('preprocessed_progs_test.json')


def _root_tokens(ast):
    """Return the set of tokens that directly follow a ``'root'`` key in *ast*.

    *ast* is stringified, the structural characters (brackets, braces,
    colons, commas), empty-string literals and escaped newlines are removed,
    and the remainder is split on whitespace. Tokens keep their surrounding
    single quotes.
    """
    text = str(ast)
    # Same removals, in the same order, for both columns, so the vocabulary
    # agrees with the newline-stripped text the graph-building pass looks up.
    for junk in ("[", "\\n", "]", "{", "}", "''", ":", ","):
        text = text.replace(junk, "")
    words = text.split()
    # Pairing each word with its successor avoids indexing past the end when
    # 'root' happens to be the final word.
    return {nxt for cur, nxt in zip(words, words[1:]) if cur == "'root'"}


# vocabulary: every unique token found in the target and source trees
i = set()
for column in ("target_ast", "source_ast"):
    for item in data[column]:
        i.update(_root_tokens(item))

# map each token to an int; sorting makes the ids reproducible across runs
# (plain set iteration order depends on per-process string hashing)
name_to_int = {name: number for number, name in enumerate(sorted(i))}
idx = sorted(name_to_int.values())

# one-hot encode the vocabulary: row k is the vector for token id k.
# np.eye yields the same float64 identity matrix the zeros-then-scatter
# construction produced, and also copes with an empty vocabulary.
vectors = np.eye(len(idx))

# convert from numpy array to tensor
vector_tensor = th.from_numpy(vectors)
# initialize lists holding graphs
target_container = []
source_container = []

# Both columns go through exactly the same conversion; only the destination
# list differs, so a single loop handles the pair (target first, then source).
for column, container in (("target_ast", target_container),
                          ("source_ast", source_container)):
    # Iterate the rows themselves rather than label-indexing the column with
    # a positional counter.
    for item in data[column]:
        # fill() reads and updates these module-level names, so they must be
        # reset before every tree.
        edge_list = []
        tensor_list = []
        # Start at token 1; presumably token 0 is the tree's opening '{'
        # (NOTE(review): confirm against the data format).
        location = 1
        total = 0
        # start graph
        g = dgl.DGLGraph()

        # Flatten the tree to a token list: drop escaped newlines, the
        # 'root'/'children' key names and punctuation, then pad braces with
        # spaces so each brace becomes its own token.
        # NOTE(review): the substring removal of "root"/"children" would also
        # alter any token that contains those words - verify the vocabulary.
        text = str(item)
        for junk in ("\\n", "root", "children", "[", "]", "'", ":", ","):
            text = text.replace(junk, "")
        input_1 = text.replace("{", "{ ").replace("}", " }").split()

        # build the graph (populates g, edge_list, tensor_list, total)
        fill()

        # A single-node tree has no edges; unpacking zip() of an empty list
        # would raise ValueError, so only add edges when there are some.
        if edge_list:
            src, dst = zip(*edge_list)
            g.add_edges(src, dst)

        # Node features: one row per node, in node-id order. Stacking the
        # collected rows gives the matrix directly, with no random
        # placeholder allocation to overwrite; the cast keeps the default
        # floating dtype that th.randn would have produced.
        g.ndata['info'] = th.stack(tensor_list).to(th.get_default_dtype())

        # add to the list
        container.append(g)
# Visual sanity check: convert the target graph stored at position 78 to a
# networkx graph and draw it with the Kamada-Kawai layout, labelling nodes.
example_graph = target_container[78].to_networkx()
nx.draw_kamada_kawai(example_graph, with_labels=True)