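# maldozer.py
# MalDozer-style Android malware detection: API-call sequences are extracted from
# decompiled smali code, mapped to vocabulary indices, and fed to a 1D-CNN
# classifier that predicts benign vs. malware for each sample.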
from __future__ import print_function

import glob
import os

import numpy as np
from keras import layers
from keras.models import Model, model_from_json
# Model / preprocessing hyperparameters.
EMBED_HIDDEN_SIZE = 64                   # dimensionality of the API-call embedding
NO_OF_CONV_FILTERS = 256                 # number of Conv1D filters
NO_OF_CONV_SIZE = 3                      # Conv1D kernel size
API_SEQUENCE_MAX_LEN = 600               # every API-call sequence is truncated/padded to this length
NUMBER_OF_API_CALLS = 6
NUMBER_OF_TRAINING_EXAMPLES_PER_BATCH = 4
# Build the API-call vocabulary, one known call per line.
# Index 0 is reserved for unknown/padding calls, so known calls are indexed from 1.
all_api_calls = []
with open('mixed_dataset/all_api_calls.txt') as all_api_calls_file:
    for line in all_api_calls_file.readlines():
        all_api_calls.append(line[:-1])  # drop the trailing newline
api_index = dict((c, i + 1) for i, c in enumerate(all_api_calls))
print('API Index')
#print(api_index)
# Each *.out entry under mixed_dataset/ is a decompiled sample containing .smali files.
all_training_samples = glob.glob('mixed_dataset/*.out')
#all_training_samples = all_training_samples[:NUMBER_OF_TRAINING_EXAMPLES_PER_BATCH]
#print(all_training_samples)
#print(len(all_training_samples))
processed_training_examples = []
processing_count = 0
for sample in all_training_samples:
    # Grep the sample's smali code for *Manager API invocations and collect the
    # sorted, de-duplicated call signatures as that sample's raw feature list.
    command = ('find ' + sample + ' -name "*.smali" -exec cat {} \\; '
               '| grep /*Manager | grep ";->" | grep \\(.*\\) '
               '| grep -v "Layout" | sort | uniq > feature_extractor.txt')
    os.system(command)
    xlist = []
    with open('feature_extractor.txt', 'r') as feature_file:
        for line in feature_file.readlines():
            xlist.append(line[:-1])
    processed_training_examples.append(xlist)
    print('processing count ' + str(processing_count))
    processing_count = processing_count + 1
#print('Processed Training Examples')
print(processed_training_examples)
print(len(processed_training_examples))
processing_count = 0
processed_and_indexed_training_examples = []
for example in processed_training_examples:
    # Map every API call to its vocabulary index (0 for unknown calls), then
    # truncate or zero-pad the sequence to exactly API_SEQUENCE_MAX_LEN entries.
    xlist = []
    for api_call in example:
        if api_call in all_api_calls:
            xlist.append(api_index[api_call])
        else:
            xlist.append(0)
    #print(len(xlist))
    if len(xlist) >= API_SEQUENCE_MAX_LEN:
        xlist = xlist[:API_SEQUENCE_MAX_LEN]
    else:
        xlist.extend([0] * (API_SEQUENCE_MAX_LEN - len(xlist)))
    processed_and_indexed_training_examples.append(xlist)
    print('Indexing count ' + str(processing_count))
    processing_count = processing_count + 1
#print('Processed And Indexed Training Examples')
print(processed_and_indexed_training_examples)
processed_and_indexed_training_examples = np.array(processed_and_indexed_training_examples)
# One-hot labels: benign = [1, 0], malware = [0, 1].
# The labels below are hard-coded for the four samples in this batch (all malware).
outputs = [[0, 1], [0, 1], [0, 1], [0, 1]]
outputs = np.array(outputs)
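# One-off training code: builds the embedding + Conv1D + global-max-pooling
# classifier, trains it, and saves it to model.json / model.h5. Kept commented
# out so the script only runs inference against the previously saved model.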
'''
API_sequence = layers.Input(shape=(API_SEQUENCE_MAX_LEN,), dtype='int32')
# +1 so the embedding table covers index 0 (unknown/padding) as well as 1..len(all_api_calls)
embedded_sequence = layers.Embedding(len(all_api_calls) + 1, EMBED_HIDDEN_SIZE)(API_sequence)
conv_layer = layers.Conv1D(NO_OF_CONV_FILTERS, NO_OF_CONV_SIZE)(embedded_sequence)
activ_layer = layers.Activation('relu')(conv_layer)
global_max_pool = layers.GlobalMaxPooling1D()(activ_layer)
fc_layer = layers.Dense(256)(global_max_pool)
fc_relu = layers.Activation('relu')(fc_layer)
dropout_layer = layers.Dropout(0.5)(fc_relu)
final_layer = layers.Dense(2)(dropout_layer)
final_softmax_layer = layers.Activation('softmax')(final_layer)
model = Model(API_sequence, final_softmax_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(processed_and_indexed_training_examples, outputs, epochs=100)
# serialize the architecture to JSON and the weights to HDF5
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("model.h5")
print("Saved model to disk")
'''
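# Inference: reload the trained model (architecture from model.json, weights
# from model.h5) and classify the processed API-call sequences.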
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load the trained weights into the reconstructed model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")
loaded_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# each prediction row is [P(benign), P(malware)] for one processed sample
print(loaded_model.predict(processed_and_indexed_training_examples))