extract.py
import argparse
import binascii as ba
import json
import os
from urllib.parse import urljoin

import openai
import pytesseract
import requests
from bs4 import BeautifulSoup
from colorama import Fore, init
from dotenv import load_dotenv
from PIL import Image

init(autoreset=True)
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
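# Note: the API key is read from a local .env file via python-dotenv; a
# minimal .env would contain a single line such as (placeholder value):
#   OPENAI_API_KEY=sk-...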
def download_page(url, user_agent):
    """Fetch a webpage and return its raw HTML."""
    print(Fore.BLUE + "Downloading webpage content...")
    headers = {'User-Agent': user_agent}
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    # Return the raw HTML rather than stripped text: main() still needs the
    # <img> tags for image extraction, and plain-text extraction happens
    # there after parsing.
    return response.text
def download_images(soup, base_url):
    print(Fore.BLUE + "Downloading images...")
    # This list will store the paths of downloaded images.
    downloaded_image_paths = []
    for i, img in enumerate(soup.find_all('img'), start=1):
        src = img.get('src')
        if not src:
            continue  # Skip <img> tags without a src attribute
        img_url = urljoin(base_url, src)
        if img_url.startswith('data:image/'):  # Embedded (base64) images
            ext = img_url.partition('data:image/')[2].split(';')[0]
            img_path = os.path.join(os.getcwd(), f'newim{i}.{ext}')
            with open(img_path, 'wb') as f:
                f.write(ba.a2b_base64(img_url.partition('base64,')[2]))
            downloaded_image_paths.append(img_path)
        else:
            img_response = requests.get(img_url, timeout=30)
            # Drop any query string and fall back to a generated name.
            img_name = os.path.basename(img_url.split('?', 1)[0]) or f'newim{i}'
            img_path = os.path.join(os.getcwd(), img_name)
            with open(img_path, 'wb') as f:
                f.write(img_response.content)
            downloaded_image_paths.append(img_path)  # Add the path to the list
    return downloaded_image_paths  # Return the list of downloaded image paths
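# A data: URI embeds the image inline, e.g. (truncated, illustrative):
#   data:image/png;base64,iVBORw0KGgoAAAANSUhEUg...
# The branch above splits out the "png" extension and decodes the base64
# payload to disk, so OCR can treat it like any downloaded image file.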
def ocr_image(image_path):
    print(Fore.BLUE + f"Performing OCR on {image_path}...")
    return pytesseract.image_to_string(Image.open(image_path))
def parse_openai_response_to_iocs(text):
    """Parses the OpenAI text response into a list of IOC dictionaries."""
    iocs = []
    lines = [line for line in text.split('\n') if line.strip()]
    for line in lines:
        parts = line.split(" : ")
        if len(parts) == 3:
            iocs.append({
                "Indicator Type": parts[0].strip(),
                "Indicator": parts[1].strip(),
                "Context": parts[2].strip(),
            })
    if not iocs and "No IOCs found" not in text:
        # The model replied in an unexpected format; signal the caller
        # to resubmit the request.
        return None
    return iocs
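# For reference, a hypothetical model response line and the dictionary it
# parses into (the indicator value below is illustrative only):
#   "IPv4 : 203.0.113.10 : C2 server observed hosting the payload"
#   -> {"Indicator Type": "IPv4",
#       "Indicator": "203.0.113.10",
#       "Context": "C2 server observed hosting the payload"}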
def extract_iocs_with_openai(content, context_identifier, retry_count=0, max_retries=1):
    if retry_count > max_retries:
        print(Fore.RED + "Maximum retry limit reached. Moving on without additional retries.")
        return []
    system_prompt = ("Return only as requested, without comments or code blocks, "
                     "only as plain text. If nothing is found return: "
                     "'No IOCs found in the provided text'")
    prompt = (
        "Extract all IOCs (IP addresses, domain names, email addresses, email subjects, "
        "file names, user-agent strings, URLs, usernames, passwords, SHA and MD5 hashes, "
        "and so on) from the following text and format each as "
        "'indicator type : indicator : context'. Context should be information that "
        "surrounds the IOC, such as the filename associated with a hash, the type of "
        "infrastructure that exists on an IPv4 address, and so on. "
        f"{content}")
    messages = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}]
    try:
        response = openai.chat.completions.create(
            model="gpt-4-turbo-preview",
            messages=messages,
            temperature=0.3,
            max_tokens=4096,
            top_p=1.0,
            frequency_penalty=0.0,
            presence_penalty=0.0
        )
        iocs_text = response.choices[0].message.content
        print(iocs_text)
        iocs = parse_openai_response_to_iocs(iocs_text)
        if iocs is None:
            print(Fore.YELLOW + f"Retry {retry_count + 1}/{max_retries} due to format mismatch...")
            return extract_iocs_with_openai(content, context_identifier, retry_count + 1, max_retries)
        return iocs
    except Exception as e:
        print(Fore.RED + f"An error occurred while querying OpenAI: {e}")
        return []
def cleanup_files(file_paths):
    for path in file_paths:
        try:
            os.remove(path)
            print(Fore.GREEN + f"Successfully removed {path}.")
        except Exception as e:
            print(Fore.RED + f"Error removing file {path}: {e}")
def main(url, output_file,
         user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
         retry_limit=1):
    print(Fore.GREEN + "Starting IOC extraction process...")
    if url.startswith("file://"):  # Read HTML content from a local file
        from pathlib import Path
        page_content = Path(url.split("file://", 1)[1]).read_text()
    else:
        page_content = download_page(url, user_agent)
    soup = BeautifulSoup(page_content, 'html.parser')
    # Collapse the visible page text into stripped, non-empty lines for the model.
    lines = [line.strip() for line in soup.get_text(separator='\n').split('\n')
             if line.strip()]
    text_content = "\n".join(lines)
    downloaded_images = download_images(soup, url)
    text_iocs = extract_iocs_with_openai(text_content, "text_content", 0, retry_limit)
    all_iocs = text_iocs[:]
    for img_path in downloaded_images:
        img_text = ocr_image(img_path)
        img_iocs = extract_iocs_with_openai(img_text, img_path, 0, retry_limit)
        all_iocs.extend(img_iocs)
    cleanup_files(downloaded_images)
    with open(output_file, 'w') as f:
        json.dump(all_iocs, f, indent=4)
    print(Fore.GREEN + "IOC extraction process completed.")
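# The resulting JSON file is a flat list of IOC objects, e.g. (illustrative
# values only):
#   [
#       {"Indicator Type": "domain",
#        "Indicator": "malicious.example",
#        "Context": "phishing landing page"}
#   ]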
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Extract IOCs from a webpage.')
    parser.add_argument('--url', required=True,
                        help='The URL of the webpage to analyze')
    parser.add_argument('--output', required=True,
                        help='The JSON file to output')
    parser.add_argument('--retry-limit', type=int, default=3,
                        help='The maximum number of retries for OpenAI requests')
    args = parser.parse_args()
    main(args.url, args.output, retry_limit=args.retry_limit)
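# Example invocation (URL and output path are placeholders):
#   python extract.py --url https://example.com/threat-report.html \
#       --output iocs.json --retry-limit 2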