-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfeature_extraction.py
58 lines (51 loc) · 2.57 KB
/
feature_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import pefile
import pandas as pd
import math
# Function to calculate entropy of a section
def calculate_entropy(data):
    """Return the Shannon entropy of *data* in bits per byte.

    Args:
        data: A ``bytes``/``bytearray`` object (anything whose ``count``
            method accepts a one-byte ``bytes`` needle).

    Returns:
        A float in the range 0.0 to 8.0. Empty input yields ``0.0``.
    """
    if not data:
        return 0.0

    total = len(data)  # invariant across all 256 byte values
    entropy = 0.0
    for byte_value in range(256):
        occurrences = data.count(bytes((byte_value,)))
        if occurrences == 0:
            # A byte that never appears contributes nothing (and log2(0)
            # is undefined), so skip it.
            continue
        probability = occurrences / total
        # math.log2 is more accurate than math.log(x, 2) for base-2 logs.
        entropy -= probability * math.log2(probability)
    return entropy
def extract_features(file_path):
    """Extract the 23 PE-header features from the file at *file_path*.

    Args:
        file_path: Path of the PE file, passed straight to ``pefile.PE``.

    Returns:
        A one-row ``pandas.DataFrame`` whose columns are the 23 feature
        names, in the order they are listed below. ``SectionMinEntropy``
        and ``SectionMinVirtualsize`` are ``None`` when the file has no
        sections.

    Raises:
        Any exception raised by ``pefile.PE`` while opening or parsing the
        file propagates unchanged.
    """
    pe = pefile.PE(file_path)
    try:
        optional_header = pe.OPTIONAL_HEADER
        file_header = pe.FILE_HEADER
        sections = pe.sections
        has_export = hasattr(pe, 'DIRECTORY_ENTRY_EXPORT')
        has_import = hasattr(pe, 'DIRECTORY_ENTRY_IMPORT')

        # Per-section minima. `default=None` keeps a PE with zero sections
        # from raising ValueError and leaves both features unset together.
        min_entropy = min(
            (calculate_entropy(section.get_data()) for section in sections),
            default=None,
        )
        min_virtual_size = min(
            (section.Misc_VirtualSize for section in sections),
            default=None,
        )

        # Insertion order defines the DataFrame column order; keep it fixed.
        features = {
            'MajorLinkerVersion': optional_header.MajorLinkerVersion,
            'MinorOperatingSystemVersion': optional_header.MinorOperatingSystemVersion,
            'MajorSubsystemVersion': optional_header.MajorSubsystemVersion,
            'SizeOfStackReserve': optional_header.SizeOfStackReserve,
            'TimeDateStamp': file_header.TimeDateStamp,
            'MajorOperatingSystemVersion': optional_header.MajorOperatingSystemVersion,
            'Characteristics': file_header.Characteristics,
            'ImageBase': optional_header.ImageBase,
            'Subsystem': optional_header.Subsystem,
            'MinorImageVersion': optional_header.MinorImageVersion,
            'MinorSubsystemVersion': optional_header.MinorSubsystemVersion,
            'SizeOfInitializedData': optional_header.SizeOfInitializedData,
            'DllCharacteristics': optional_header.DllCharacteristics,
            'DirectoryEntryExport': 1 if has_export else 0,
            # Data directory slot 0 is read only when an export directory
            # was parsed; otherwise the size is reported as 0.
            'ImageDirectoryEntryExport': (
                optional_header.DATA_DIRECTORY[0].Size if has_export else 0
            ),
            'CheckSum': optional_header.CheckSum,
            # Data directory slot 1 is read only when an import directory
            # was parsed; otherwise the size is reported as 0.
            'DirectoryEntryImportSize': (
                optional_header.DATA_DIRECTORY[1].Size if has_import else 0
            ),
            # NOTE(review): despite its name this is the number of sections
            # (the original marks it as an example calculation). Left as-is;
            # confirm what downstream consumers expect before changing it.
            'SectionMaxChar': len(sections),
            'MajorImageVersion': optional_header.MajorImageVersion,
            'AddressOfEntryPoint': optional_header.AddressOfEntryPoint,
            'SectionMinEntropy': min_entropy,
            'SizeOfHeaders': optional_header.SizeOfHeaders,
            'SectionMinVirtualsize': min_virtual_size,
        }
    finally:
        # Release the file handle/mapping pefile holds for files opened by
        # path, even if feature extraction fails part-way.
        pe.close()

    return pd.DataFrame([features])