Commit

new updated files

harshdM99 committed Feb 15, 2025
1 parent 11de432 commit 3f29993
Showing 9 changed files with 249 additions and 0 deletions.
72 changes: 72 additions & 0 deletions crawler/Dockerfile
@@ -0,0 +1,72 @@
# # Use an official lightweight Python image as a base
# FROM python:3.9-slim

# # Set environment variables
# ENV TOR_PATH="/opt/tor-browser_en-US/"

# # Install necessary packages
# RUN apt-get update && apt-get install -y \
# tor \
# wget \
# xvfb \
# x11-utils \
# xdg-utils \
# gnupg \
# curl \
# unzip \
# && apt-get clean

# # Add MongoDB repository and install the MongoDB shell (`mongosh`)
# RUN curl -fsSL https://www.mongodb.org/static/pgp/server-6.0.asc | gpg --dearmor -o /usr/share/keyrings/mongodb-keyring.gpg \
# && echo "deb [signed-by=/usr/share/keyrings/mongodb-keyring.gpg] https://repo.mongodb.org/apt/debian bullseye/mongodb-org/6.0 main" | tee /etc/apt/sources.list.d/mongodb-org-6.0.list \
# && apt-get update \
# && apt-get install -y mongodb-org-shell

# # Install Python dependencies
# COPY requirements.txt .
# RUN pip install --no-cache-dir -r requirements.txt

# # Create Tor Browser directory
# RUN mkdir -p $TOR_PATH

# RUN apt-get install -y xz-utils file

# # Dynamically fetch the latest Tor Browser version
# RUN export TOR_VERSION=$(curl -fsSL https://dist.torproject.org/torbrowser/ | grep -o 'tor-browser-linux64-[0-9.]\+_en-US.tar.xz' | head -n 1) && \
# echo "Downloading Tor Browser version: $TOR_VERSION" && \
# curl -fsSL -o /tmp/tor.tar.xz "https://dist.torproject.org/torbrowser/$TOR_VERSION" || \
# curl -fsSL -o /tmp/tor.tar.xz "https://mirror.torproject.org/torbrowser/$TOR_VERSION"

# # ✅ Verify the file before extracting
# RUN file /tmp/tor.tar.xz && \
# tar -xf /tmp/tor.tar.xz -C /opt/ && \
# rm /tmp/tor.tar.xz

# # Copy project files
# WORKDIR /app
# COPY . .

# # Expose MongoDB default port
# EXPOSE 27017

# # Start the Tor service and run the crawler
# CMD ["bash", "-c", "tor & sleep 5 && python main.py"]


FROM ubuntu:20.04

# Avoid interactive prompts (e.g. tzdata) during image build
ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y python3.8 python3-pip wget
RUN pip3 install tbselenium selenium pymongo

RUN wget https://www.torproject.org/dist/torbrowser/14.0.4/tor-browser-linux-x86_64-14.0.4.tar.xz
RUN tar -xf tor-browser-linux-x86_64-14.0.4.tar.xz

WORKDIR /app
COPY . .

EXPOSE 27017

# Tor Browser binary ends up at /tor-browser/Browser/firefox
# ENTRYPOINT [ "python3", "/app/main.py" ]
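
Note that webdriver.Firefox in crawler.py needs a firefox binary (and geckodriver) that this image never installs via apt; the bundle unpacked at /tor-browser does ship one. A hedged sketch of pointing Selenium at that binary, with the path assumed from the comment above:

from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
# Assumed layout of the 14.0.4 tarball; geckodriver must still be on PATH
options.binary_location = "/tor-browser/Browser/firefox"
driver = webdriver.Firefox(options=options)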
Empty file added crawler/Dockerfile2.txt
Empty file.
39 changes: 39 additions & 0 deletions crawler/bitcoin_address_extractor.py
@@ -0,0 +1,39 @@
from hashlib import sha256

class BitcoinAddressExtractor:
    digits58 = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'

    @staticmethod
    def decode_base58(bc, length):
        n = 0
        for char in bc:
            n = n * 58 + BitcoinAddressExtractor.digits58.index(char)
        return n.to_bytes(length, 'big')

    @staticmethod
    def check_bc(bc):
        '''
        Check whether the input text is a valid Bitcoin address.
        '''
        try:
            bcbytes = BitcoinAddressExtractor.decode_base58(bc, 25)
            # Verify checksum: last 4 bytes must equal the double-SHA256 of the rest
            return bcbytes[-4:] == sha256(sha256(bcbytes[:-4]).digest()).digest()[:4]
        except Exception:
            return False

    @staticmethod
    def extract_addresses(text):
        addresses = []
        i = 0
        while i <= len(text) - 26:
            # Check if the starting character is '1' or '3'
            if text[i] == '1' or text[i] == '3':
                # Try lengths from 26 to 35 (Bitcoin address range)
                for win_len in range(26, 36):
                    potential_address = text[i:i+win_len]
                    if BitcoinAddressExtractor.check_bc(potential_address):
                        addresses.append(potential_address)
                        break
            i += 1
        return addresses
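
The extractor brute-forces every 26-35 character window that starts with '1' or '3' and keeps only windows whose Base58Check checksum verifies. A quick sanity check (hypothetical usage, not part of this commit; the sample is the well-known genesis-block address):

from bitcoin_address_extractor import BitcoinAddressExtractor

# Known-valid example address (the genesis-block address)
sample = "Payments go to 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa over Tor."
print(BitcoinAddressExtractor.extract_addresses(sample))
# -> ['1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa']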
5 changes: 5 additions & 0 deletions crawler/config.py
@@ -0,0 +1,5 @@
# Update based on your system
CRAWL_LIMIT = 10000
TOR_PATH = "/opt/tor-browser_en-US/" # path to tor browser
DB_NAME = "crawlerData1"
URL = 'http://thehiddenwiki.org/'
62 changes: 62 additions & 0 deletions crawler/crawler.py
@@ -0,0 +1,62 @@
# crawler.py
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException
from bitcoin_address_extractor import BitcoinAddressExtractor
from config import CRAWL_LIMIT

class Crawler:
    def __init__(self, tor_path, db):
        # self.driver = TorBrowserDriver(tor_path)
        self.db = db
        options = Options()
        options.add_argument("--headless")  # Run in headless mode (no GUI)

        # Configure Firefox to use Tor's SOCKS5 proxy
        options.set_preference("network.proxy.type", 1)
        options.set_preference("network.proxy.socks", "127.0.0.1")
        options.set_preference("network.proxy.socks_port", 9050)
        options.set_preference("network.proxy.socks_remote_dns", True)

        # Start a headless Firefox browser
        self.driver = webdriver.Firefox(options=options)

    def crawl(self, url):
        for _ in range(CRAWL_LIMIT):
            if url is None:
                break  # frontier exhausted
            try:
                self.driver.get(url)
                # Queue newly discovered .onion links for later visits
                for link in self.get_all_links():
                    self.db.insert_link(link)

                text = self.get_page_text()
                addresses = BitcoinAddressExtractor.extract_addresses(text)

                if addresses:
                    metadata = self.get_metadata()
                    self.db.insert_page_data(url, metadata, addresses)

                self.db.update_link_visited(url)
                url = self.db.get_next_link()

            except Exception as e:
                print(f"Error while processing {url}: {e}")
                self.db.update_link_visited(url)
                url = self.db.get_next_link()

    def get_all_links(self):
        all_links = []
        elems = self.driver.find_elements(By.XPATH, "//a[@href]")
        for elem in elems:
            link = elem.get_attribute("href")
            if re.search(r'\.onion', link):
                all_links.append(link)
        return all_links

    def get_page_text(self):
        return self.driver.find_element(By.XPATH, "/html/body").text

    def get_metadata(self):
        try:
            return self.driver.find_element(By.XPATH, "//meta[@name='description']").get_attribute("content")
        except NoSuchElementException:
            return ""
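
The commented-out first line of the constructor points at tbselenium's TorBrowserDriver, which drives a full Tor Browser instead of proxying plain Firefox through SOCKS. A minimal sketch of that alternative, assuming Tor Browser is unpacked at TOR_PATH (the exact load_url behavior may vary by tbselenium version):

from tbselenium.tbdriver import TorBrowserDriver
from config import TOR_PATH

# Hypothetical alternative to the proxied-Firefox setup above
with TorBrowserDriver(TOR_PATH) as driver:
    driver.load_url("http://example.onion")
    print(driver.find_element("xpath", "/html/body").text[:200])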
28 changes: 28 additions & 0 deletions crawler/database.py
@@ -0,0 +1,28 @@
from pymongo import MongoClient

class Database:
    def __init__(self, db_name):
        self.client = MongoClient('localhost', 27017)
        self.db = self.client[db_name]
        self.table1 = self.db['websites']     # pages with extracted addresses
        self.table2 = self.db['crawl_links']  # frontier of links to crawl

    def insert_link(self, link, visited=0):
        self.table2.insert_one({"link": link, "visited": visited})

    def get_next_link(self):
        # Returns None once every queued link has been visited
        doc = self.table2.find_one({"visited": 0})
        return doc['link'] if doc else None

    def update_link_visited(self, link):
        query = {"link": link}
        new_value = {"$set": {"visited": 1}}
        self.table2.update_one(query, new_value)

    def insert_page_data(self, url, metadata, addresses):
        record = {
            "link": url,
            "metadata": metadata,
            "bitcoin addresses": addresses
        }
        self.table1.insert_one(record)
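
crawl_links acts as the crawl frontier and websites stores the results. A hypothetical round-trip through the queue (assumes a MongoDB instance on localhost:27017; not part of this commit):

from database import Database

db = Database("crawlerData1")
db.insert_link("http://example.onion")   # enqueue as visited=0
url = db.get_next_link()                 # -> "http://example.onion"
db.insert_page_data(url, "demo page", ["1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa"])
db.update_link_visited(url)              # mark done
assert db.get_next_link() is None        # frontier empty again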
23 changes: 23 additions & 0 deletions crawler/docker-compose.yml
@@ -0,0 +1,23 @@
services:
  mongodb:
    image: mongo:4.4
    container_name: darkweb_mongo
    restart: always
    ports:
      - "27017:27017"
    volumes:
      - mongo_data:/data/db

  crawler:
    build: .
    container_name: darkweb_crawler
    depends_on:
      - mongodb
    environment:
      - TOR_PATH=/opt/tor-browser_en-US/
    volumes:
      - .:/app
    restart: always

volumes:
  mongo_data:
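
One caveat with this topology: database.py connects to 'localhost', which from inside the crawler container is the container itself, not the mongodb service. The usual fix is to resolve MongoDB by its compose service name; a hedged sketch, where MONGO_HOST is a hypothetical variable this compose file does not yet set:

import os
from pymongo import MongoClient

# Inside the compose network the service name "mongodb" is the hostname
mongo_host = os.environ.get("MONGO_HOST", "mongodb")
client = MongoClient(mongo_host, 27017)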
17 changes: 17 additions & 0 deletions crawler/main.py
@@ -0,0 +1,17 @@
from database import Database
from crawler import Crawler
from config import DB_NAME, TOR_PATH, URL

def main():
    db = Database(DB_NAME)
    crawler = Crawler(TOR_PATH, db)

    # Insert the initial seed URL into the database as unvisited,
    # so that get_next_link() can hand it back
    db.insert_link(URL, visited=0)

    # Start crawling process
    next_url = db.get_next_link()
    crawler.crawl(next_url)

if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions crawler/requirements.txt
@@ -0,0 +1,3 @@
tbselenium
selenium
pymongo
