-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
249 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
# # Use an official lightweight Python image as a base | ||
# FROM python:3.9-slim | ||
|
||
# # Set environment variables | ||
# ENV TOR_PATH="/opt/tor-browser_en-US/" | ||
|
||
# # Install necessary packages | ||
# RUN apt-get update && apt-get install -y \ | ||
# tor \ | ||
# wget \ | ||
# xvfb \ | ||
# x11-utils \ | ||
# xdg-utils \ | ||
# gnupg \ | ||
# curl \ | ||
# unzip \ | ||
# && apt-get clean | ||
|
||
# # Add MongoDB repository and install the MongoDB shell (`mongosh`) | ||
# RUN curl -fsSL https://www.mongodb.org/static/pgp/server-6.0.asc | gpg --dearmor -o /usr/share/keyrings/mongodb-keyring.gpg \ | ||
# && echo "deb [signed-by=/usr/share/keyrings/mongodb-keyring.gpg] https://repo.mongodb.org/apt/debian bullseye/mongodb-org/6.0 main" | tee /etc/apt/sources.list.d/mongodb-org-6.0.list \ | ||
# && apt-get update \ | ||
# && apt-get install -y mongodb-org-shell | ||
|
||
# # Install Python dependencies | ||
# COPY requirements.txt . | ||
# RUN pip install --no-cache-dir -r requirements.txt | ||
|
||
# # Create Tor Browser directory | ||
# RUN mkdir -p $TOR_PATH | ||
|
||
# RUN apt-get install -y xz-utils file | ||
|
||
# # Dynamically fetch the latest Tor Browser version | ||
# RUN export TOR_VERSION=$(curl -fsSL https://dist.torproject.org/torbrowser/ | grep -o 'tor-browser-linux64-[0-9.]\+_en-US.tar.xz' | head -n 1) && \ | ||
# echo "Downloading Tor Browser version: $TOR_VERSION" && \ | ||
# curl -fsSL -o /tmp/tor.tar.xz "https://dist.torproject.org/torbrowser/$TOR_VERSION" || \ | ||
# curl -fsSL -o /tmp/tor.tar.xz "https://mirror.torproject.org/torbrowser/$TOR_VERSION" | ||
|
||
# # ✅ Verify the file before extracting | ||
# RUN file /tmp/tor.tar.xz && \ | ||
# tar -xf /tmp/tor.tar.xz -C /opt/ && \ | ||
# rm /tmp/tor.tar.xz | ||
|
||
# # Copy project files | ||
# WORKDIR /app | ||
# COPY . . | ||
|
||
# # Expose MongoDB default port | ||
# EXPOSE 27017 | ||
|
||
# # Start the Tor service and run the crawler | ||
# CMD ["bash", "-c", "tor & sleep 5 && python main.py"] | ||
|
||
|
||
FROM ubuntu:20.04

# Keep apt from stopping the build on interactive prompts (e.g. tzdata).
ARG DEBIAN_FRONTEND=noninteractive

# xz-utils is required by `tar` to unpack the .tar.xz Tor Browser bundle.
RUN apt-get update \
    && apt-get install -y python3.8 python3-pip wget xz-utils \
    && rm -rf /var/lib/apt/lists/*
RUN pip3 install --no-cache-dir tbselenium selenium pymongo

# Download, unpack and delete the archive in one layer so the tarball does
# not stay in the image. The bundle unpacks to /tor-browser.
RUN wget https://www.torproject.org/dist/torbrowser/14.0.4/tor-browser-linux-x86_64-14.0.4.tar.xz \
    && tar -xf tor-browser-linux-x86_64-14.0.4.tar.xz \
    && rm tor-browser-linux-x86_64-14.0.4.tar.xz

WORKDIR /app
COPY . .

EXPOSE 27017

# tor path
# /tor-browser/Browser/firefox
# NOTE(review): config.py sets TOR_PATH to /opt/tor-browser_en-US/, which does
# not match the location above -- confirm which one is intended.
# NOTE(review): crawler.py starts Firefox through Selenium and expects a SOCKS
# proxy on 127.0.0.1:9050; this image does not visibly start a Tor daemon --
# confirm how the proxy is provided.

# Exec form needs one array element per argument; the previous single string
# "python3 /app/main.py" would have been looked up as one executable name.
CMD ["python3", "/app/main.py"]
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
from hashlib import sha256 | ||
|
||
class BitcoinAddressExtractor:
    """Locate Base58Check-encoded Bitcoin addresses inside arbitrary text."""

    # Base58 alphabet: digits and letters without 0, O, I and l.
    digits58 = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'

    @staticmethod
    def decode_base58(bc, length):
        """Decode the Base58 string *bc* into exactly *length* big-endian bytes.

        Raises ValueError if a character is not in the alphabet and
        OverflowError if the decoded number does not fit in *length* bytes.
        """
        alphabet = BitcoinAddressExtractor.digits58
        value = 0
        for symbol in bc:
            value = value * 58 + alphabet.index(symbol)
        return value.to_bytes(length, 'big')

    @staticmethod
    def check_bc(bc):
        """Return True if *bc* decodes to 25 bytes with a valid checksum.

        The last four bytes must equal the first four bytes of the double
        SHA-256 of the preceding 21 bytes. Any failure while decoding
        (bad character, value too large, wrong input type) yields False.
        """
        try:
            raw = BitcoinAddressExtractor.decode_base58(bc, 25)
            payload, checksum = raw[:-4], raw[-4:]
            digest = sha256(sha256(payload).digest()).digest()
            return checksum == digest[:4]
        except Exception:
            return False

    @staticmethod
    def extract_addresses(text):
        """Return every checksum-valid address found in *text*, in order.

        Each position holding '1' or '3' is tried as the start of an
        address; candidate lengths 26 through 35 are tested shortest first
        and the first valid one is kept for that position.
        """
        is_valid = BitcoinAddressExtractor.check_bc
        found = []
        # Last usable start index is len(text) - 26 (shortest address length).
        for start in range(len(text) - 25):
            if text[start] not in ('1', '3'):
                continue
            for size in range(26, 36):
                candidate = text[start:start + size]
                if is_valid(candidate):
                    found.append(candidate)
                    break
        return found
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Crawler settings -- adjust these values for your system.

# Upper bound on the number of loop iterations performed by Crawler.crawl.
CRAWL_LIMIT: int = 10_000

# Path to the Tor Browser installation; handed to the Crawler constructor.
TOR_PATH: str = "/opt/tor-browser_en-US/"

# Name of the MongoDB database opened by Database.
DB_NAME: str = "crawlerData1"

# Seed URL registered by main() before crawling starts.
URL: str = "http://thehiddenwiki.org/"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# crawler.py | ||
import re | ||
from selenium import webdriver | ||
from selenium.webdriver.firefox.options import Options | ||
from selenium.common.exceptions import NoSuchElementException | ||
from bitcoin_address_extractor import BitcoinAddressExtractor | ||
from config import CRAWL_LIMIT | ||
|
||
class Crawler:
    """Visit pages through a Tor SOCKS proxy and store Bitcoin addresses.

    Pages are loaded in a headless Firefox instance. For every page the
    ``.onion`` links are queued in the database and any Bitcoin addresses
    found in the page text are stored together with the page description.
    """

    # Locator strategy string accepted by ``WebDriver.find_element(s)``; it is
    # the value of selenium's ``By.XPATH``. The ``find_element(s)_by_xpath``
    # helpers used previously no longer exist in current Selenium releases.
    _XPATH = "xpath"

    def __init__(self, tor_path, db):
        """Start the browser and remember the database handle.

        Args:
            tor_path: Path to a Tor Browser installation. Currently unused
                (the plain Firefox driver is started instead); kept so that
                existing callers keep working.
            db: Storage object providing ``insert_link``, ``get_next_link``,
                ``update_link_visited`` and ``insert_page_data``.
        """
        # self.driver = TorBrowserDriver(tor_path)
        self.db = db
        # Links already queued or visited by this instance, so the same URL
        # is not handed to the database again and again during one run.
        self._seen_links = set()

        options = Options()
        # Command-line switch instead of the ``headless`` property, which is
        # not available in recent Selenium versions.
        options.add_argument("-headless")

        # Configure Firefox to use Tor's SOCKS5 proxy
        options.set_preference("network.proxy.type", 1)
        options.set_preference("network.proxy.socks", "127.0.0.1")
        options.set_preference("network.proxy.socks_port", 9050)
        # Resolve host names through the proxy as well.
        options.set_preference("network.proxy.socks_remote_dns", True)

        # Start a headless Firefox browser
        self.driver = webdriver.Firefox(options=options)

    def close(self):
        """Shut down the browser started by the constructor."""
        self.driver.quit()

    def crawl(self, url):
        """Crawl up to ``CRAWL_LIMIT`` pages, starting at *url*.

        After each page (successful or not) the URL is marked as visited and
        the next unvisited link is fetched from the database. Crawling stops
        early when no further link is available.
        """
        for _ in range(CRAWL_LIMIT):
            if not url:
                break  # nothing left in the queue
            self._seen_links.add(url)
            try:
                self.driver.get(url)
                all_links = self.get_all_links()
                text = self.get_page_text()
                addresses = BitcoinAddressExtractor.extract_addresses(text)

                if addresses:
                    metadata = self.get_metadata()
                    self.db.insert_page_data(url, metadata, addresses)

                self._enqueue(all_links)
            except Exception as e:
                # Best effort: one broken page must not stop the whole crawl.
                print(f"Error while processing {url}: {e}")
            url = self._advance(url)

    def _enqueue(self, links):
        """Queue links that this instance has not queued or visited yet."""
        for link in links:
            if link not in self._seen_links:
                self._seen_links.add(link)
                self.db.insert_link(link)

    def _advance(self, url):
        """Mark *url* as visited and return the next link, or None if none."""
        self.db.update_link_visited(url)
        try:
            return self.db.get_next_link()
        except (TypeError, LookupError):
            # The storage layer may signal an empty queue by failing to
            # subscript a missing document.
            return None

    def get_all_links(self):
        """Return the href of every anchor on the page containing '.onion'."""
        all_links = []
        for elem in self.driver.find_elements(self._XPATH, "//a[@href]"):
            link = elem.get_attribute("href")
            # Plain substring test: the former regex '.onion' had an
            # unescaped dot and therefore also matched e.g. 'xonion'.
            if link and ".onion" in link:
                all_links.append(link)
        return all_links

    def get_page_text(self):
        """Return the visible text of the page body."""
        return self.driver.find_element(self._XPATH, "/html/body").text

    def get_metadata(self):
        """Return the page's meta description, or [] if the tag is missing."""
        try:
            element = self.driver.find_element(
                self._XPATH, "//meta[@name='description']"
            )
            return element.get_attribute("content")
        except NoSuchElementException:
            return []
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
from pymongo import MongoClient | ||
|
||
class Database:
    """Thin wrapper around the two MongoDB collections used by the crawler.

    ``websites`` holds pages on which Bitcoin addresses were found;
    ``crawl_links`` is the queue of links with a ``visited`` flag (0 or 1).
    """

    def __init__(self, db_name, host='localhost', port=27017):
        """Connect to MongoDB and select the database *db_name*.

        Args:
            db_name: Name of the database to use.
            host: MongoDB host name; defaults to the previous fixed value.
            port: MongoDB port; defaults to the previous fixed value.
        """
        self.client = MongoClient(host, port)
        self.db = self.client[db_name]
        self.table1 = self.db['websites']
        self.table2 = self.db['crawl_links']

    def insert_link(self, link, visited=0):
        """Add *link* to the queue unless it is already stored.

        An upsert is used so that inserting the same link repeatedly (for
        example the seed URL on every start) neither creates duplicates nor
        resets the ``visited`` flag of an existing entry.
        """
        self.table2.update_one(
            {"link": link},
            {"$setOnInsert": {"visited": visited}},
            upsert=True,
        )

    def get_next_link(self):
        """Return an unvisited link, or None when the queue is empty."""
        document = self.table2.find_one({"visited": 0})
        if document is None:
            return None
        return document['link']

    def update_link_visited(self, link):
        """Mark every stored entry for *link* as visited."""
        query = {"link": link}
        new_value = {"$set": {"visited": 1}}
        # update_many so duplicates left by earlier runs are covered too.
        self.table2.update_many(query, new_value)

    def insert_page_data(self, url, metadata, addresses):
        """Store the addresses found on *url* together with its metadata."""
        record = {
            "link": url,
            "metadata": metadata,
            "bitcoin addresses": addresses,
        }
        self.table1.insert_one(record)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
services:
  mongodb:
    image: mongo:4.4
    container_name: darkweb_mongo
    restart: always
    ports:
      - "27017:27017"
    volumes:
      - mongo_data:/data/db

  crawler:
    build: .
    container_name: darkweb_crawler
    depends_on:
      - mongodb
    # The crawler code connects to MongoDB on localhost:27017. Inside its own
    # container that address is the crawler itself, so share the mongodb
    # service's network namespace to make localhost reach the database.
    network_mode: "service:mongodb"
    environment:
      - TOR_PATH=/opt/tor-browser_en-US/
    volumes:
      # Bind-mount the project so code changes apply without a rebuild.
      - .:/app
    restart: always

volumes:
  mongo_data:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from database import Database | ||
from crawler import Crawler | ||
from config import DB_NAME, TOR_PATH, URL | ||
|
||
def main():
    """Set up storage and browser, register the seed URL and start crawling.

    If the link queue already holds an unvisited link (for example from an
    earlier run), crawling resumes there; otherwise it starts at the seed
    URL from the configuration.
    """
    db = Database(DB_NAME)
    crawler = Crawler(TOR_PATH, db)

    # Insert the initial seed URL into the database
    db.insert_link(URL, visited=1)

    # The seed is stored as visited, so on a fresh database no unvisited
    # link exists yet. The lookup may then return nothing or fail while
    # subscripting a missing document; treat both as "queue is empty".
    try:
        next_url = db.get_next_link()
    except TypeError:
        next_url = None

    # Start crawling process
    crawler.crawl(next_url or URL)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
tbselenium | ||
selenium | ||
pymongo |