From 3f299938eb35bce5d0e17b6766ae6a34a8963f77 Mon Sep 17 00:00:00 2001
From: Harshdeep Mishra
Date: Fri, 14 Feb 2025 19:33:07 -0500
Subject: [PATCH] Add Tor-based dark web crawler with Bitcoin address extraction

---
 crawler/Dockerfile                   | 31 +++++++++++++
 crawler/bitcoin_address_extractor.py | 48 ++++++++++++++++++++
 crawler/config.py                    |  7 +++
 crawler/crawler.py                   | 71 ++++++++++++++++++++++++++++
 crawler/database.py                  | 37 ++++++++++++++++
 crawler/docker-compose.yml           | 24 ++++++++++
 crawler/main.py                      | 16 +++++++
 crawler/requirements.txt             |  5 ++
 8 files changed, 239 insertions(+)
 create mode 100644 crawler/Dockerfile
 create mode 100644 crawler/bitcoin_address_extractor.py
 create mode 100644 crawler/config.py
 create mode 100644 crawler/crawler.py
 create mode 100644 crawler/database.py
 create mode 100644 crawler/docker-compose.yml
 create mode 100644 crawler/main.py
 create mode 100644 crawler/requirements.txt

diff --git a/crawler/Dockerfile b/crawler/Dockerfile
new file mode 100644
index 0000000..1037a76
--- /dev/null
+++ b/crawler/Dockerfile
@@ -0,0 +1,31 @@
+FROM ubuntu:20.04
+
+# Avoid interactive apt prompts during the build
+ENV DEBIAN_FRONTEND=noninteractive
+
+# tor supplies the SOCKS proxy on 127.0.0.1:9050 that crawler.py routes
+# Firefox through; firefox and firefox-geckodriver give Selenium a browser
+# and driver to work with
+RUN apt-get update && apt-get install -y \
+    python3.8 python3-pip wget xz-utils tor firefox firefox-geckodriver \
+    && apt-get clean
+
+COPY requirements.txt .
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Pin a known Tor Browser release; bump the version as new releases ship.
+# The archive extracts to /tor-browser (keep TOR_PATH in config.py in sync).
+RUN wget https://www.torproject.org/dist/torbrowser/14.0.4/tor-browser-linux-x86_64-14.0.4.tar.xz \
+    && tar -xf tor-browser-linux-x86_64-14.0.4.tar.xz \
+    && rm tor-browser-linux-x86_64-14.0.4.tar.xz
+
+WORKDIR /app
+COPY . .
+
+# Start the Tor daemon, give it a few seconds to bootstrap, then run the crawler
+CMD ["bash", "-c", "tor & sleep 5 && python3 main.py"]
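+
+# A rough usage sketch; the image tag below is illustrative, not part of the
+# project:
+#   docker build -t darkweb-crawler .
+#   docker compose up    # preferred: also starts the MongoDB container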
diff --git a/crawler/bitcoin_address_extractor.py b/crawler/bitcoin_address_extractor.py
new file mode 100644
index 0000000..7ea67c9
--- /dev/null
+++ b/crawler/bitcoin_address_extractor.py
@@ -0,0 +1,48 @@
+from hashlib import sha256
+
+class BitcoinAddressExtractor:
+    digits58 = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
+
+    @staticmethod
+    def decode_base58(bc, length):
+        n = 0
+        for char in bc:
+            # index() raises ValueError on any non-base58 character, which
+            # check_bc treats as "not an address"
+            n = n * 58 + BitcoinAddressExtractor.digits58.index(char)
+        return n.to_bytes(length, 'big')
+
+    @staticmethod
+    def check_bc(bc):
+        '''
+        Check whether the input text is a valid Base58Check Bitcoin address.
+        '''
+        try:
+            bcbytes = BitcoinAddressExtractor.decode_base58(bc, 25)
+            # The last 4 bytes must equal the double-SHA256 checksum of the rest
+            return bcbytes[-4:] == sha256(sha256(bcbytes[:-4]).digest()).digest()[:4]
+        except Exception:
+            return False
+
+    @staticmethod
+    def extract_addresses(text):
+        addresses = []
+        i = 0
+        while i <= len(text) - 26:
+            # Legacy addresses start with '1' (P2PKH) or '3' (P2SH)
+            if text[i] == '1' or text[i] == '3':
+                # Try window lengths from 26 to 35 characters, the valid
+                # length range for Base58Check addresses
+                for win_len in range(26, 36):
+                    potential_address = text[i:i+win_len]
+                    if BitcoinAddressExtractor.check_bc(potential_address):
+                        addresses.append(potential_address)
+                        break
+            i += 1
+        return addresses
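+
+# A minimal usage sketch; the address below is the well-known genesis-block
+# address, included purely as sample data:
+if __name__ == "__main__":
+    sample = "donate: 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa"
+    print(BitcoinAddressExtractor.extract_addresses(sample))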
diff --git a/crawler/config.py b/crawler/config.py
new file mode 100644
index 0000000..bb772ac
--- /dev/null
+++ b/crawler/config.py
@@ -0,0 +1,7 @@
+import os
+
+# Update these values for your system
+CRAWL_LIMIT = 10000  # maximum number of pages to visit
+TOR_PATH = os.environ.get("TOR_PATH", "/tor-browser/")  # must match the Dockerfile extraction path
+DB_NAME = "crawlerData1"
+URL = 'http://thehiddenwiki.org/'  # seed URL for the crawl
diff --git a/crawler/crawler.py b/crawler/crawler.py
new file mode 100644
index 0000000..67ef0c3
--- /dev/null
+++ b/crawler/crawler.py
@@ -0,0 +1,71 @@
+# crawler.py
+import re
+
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.firefox.options import Options
+from selenium.common.exceptions import NoSuchElementException
+
+from bitcoin_address_extractor import BitcoinAddressExtractor
+from config import CRAWL_LIMIT
+
+class Crawler:
+    def __init__(self, tor_path, db):
+        # tor_path is kept for a future switch to tbselenium's TorBrowserDriver;
+        # for now a plain Firefox session is routed through Tor's SOCKS proxy
+        self.tor_path = tor_path
+        self.db = db  # crawl() depends on this; it was never assigned before
+
+        options = Options()
+        options.add_argument("-headless")  # run without a GUI (Selenium 4 API)
+
+        # Route all traffic, including DNS, through Tor's SOCKS5 proxy
+        options.set_preference("network.proxy.type", 1)
+        options.set_preference("network.proxy.socks", "127.0.0.1")
+        options.set_preference("network.proxy.socks_port", 9050)
+        options.set_preference("network.proxy.socks_remote_dns", True)
+
+        self.driver = webdriver.Firefox(options=options)
+
+    def crawl(self, url):
+        for _ in range(CRAWL_LIMIT):
+            if url is None:
+                break  # frontier is empty
+            try:
+                self.driver.get(url)
+
+                # Queue newly discovered .onion links for later visits
+                for link in self.get_all_links():
+                    self.db.insert_link(link)
+
+                text = self.get_page_text()
+                addresses = BitcoinAddressExtractor.extract_addresses(text)
+                if addresses:
+                    metadata = self.get_metadata()
+                    self.db.insert_page_data(url, metadata, addresses)
+
+                self.db.update_link_visited(url)
+                url = self.db.get_next_link()
+            except Exception as e:
+                print(f"Error while processing {url}: {e}")
+                self.db.update_link_visited(url)
+                url = self.db.get_next_link()
+
+    def get_all_links(self):
+        all_links = []
+        for elem in self.driver.find_elements(By.XPATH, "//a[@href]"):
+            link = elem.get_attribute("href")
+            if link and re.search(r'\.onion', link):
+                all_links.append(link)
+        return all_links
+
+    def get_page_text(self):
+        return self.driver.find_element(By.XPATH, "/html/body").text
+
+    def get_metadata(self):
+        try:
+            return self.driver.find_element(
+                By.XPATH, "//meta[@name='description']"
+            ).get_attribute("content")
+        except NoSuchElementException:
+            return None
diff --git a/crawler/database.py b/crawler/database.py
new file mode 100644
index 0000000..6fd367e
--- /dev/null
+++ b/crawler/database.py
@@ -0,0 +1,37 @@
+import os
+
+from pymongo import MongoClient
+
+class Database:
+    def __init__(self, db_name):
+        # Under docker-compose the MongoDB service is reachable as "mongodb",
+        # not "localhost", so the host is taken from the environment
+        host = os.environ.get("MONGO_HOST", "localhost")
+        self.client = MongoClient(host, 27017)
+        self.db = self.client[db_name]
+        self.websites = self.db['websites']
+        self.crawl_links = self.db['crawl_links']
+
+    def insert_link(self, link, visited=0):
+        # Upsert so re-discovered links don't create duplicate queue entries
+        self.crawl_links.update_one(
+            {"link": link},
+            {"$setOnInsert": {"link": link, "visited": visited}},
+            upsert=True,
+        )
+
+    def get_next_link(self):
+        # Returns None when no unvisited links remain
+        doc = self.crawl_links.find_one({"visited": 0})
+        return doc['link'] if doc else None
+
+    def update_link_visited(self, link):
+        self.crawl_links.update_one({"link": link}, {"$set": {"visited": 1}})
+
+    def insert_page_data(self, url, metadata, addresses):
+        record = {
+            "link": url,
+            "metadata": metadata,
+            "bitcoin_addresses": addresses,
+        }
+        self.websites.insert_one(record)
diff --git a/crawler/docker-compose.yml b/crawler/docker-compose.yml
new file mode 100644
index 0000000..cffafec
--- /dev/null
+++ b/crawler/docker-compose.yml
@@ -0,0 +1,24 @@
+services:
+  mongodb:
+    image: mongo:4.4
+    container_name: darkweb_mongo
+    restart: always
+    ports:
+      - "27017:27017"
+    volumes:
+      - mongo_data:/data/db
+
+  crawler:
+    build: .
+    container_name: darkweb_crawler
+    depends_on:
+      - mongodb
+    environment:
+      - TOR_PATH=/tor-browser/
+      - MONGO_HOST=mongodb  # reach the mongodb service rather than localhost
+    volumes:
+      - .:/app
+    restart: always
+
+volumes:
+  mongo_data:
diff --git a/crawler/main.py b/crawler/main.py
new file mode 100644
index 0000000..1a9377f
--- /dev/null
+++ b/crawler/main.py
@@ -0,0 +1,16 @@
+from database import Database
+from crawler import Crawler
+from config import DB_NAME, TOR_PATH, URL
+
+def main():
+    db = Database(DB_NAME)
+    crawler = Crawler(TOR_PATH, db)
+
+    # Insert the seed URL as unvisited so get_next_link() can find it
+    db.insert_link(URL, visited=0)
+
+    # Start the crawling process from the first unvisited link
+    crawler.crawl(db.get_next_link())
+
+if __name__ == "__main__":
+    main()
diff --git a/crawler/requirements.txt b/crawler/requirements.txt
new file mode 100644
index 0000000..f032df1
--- /dev/null
+++ b/crawler/requirements.txt
@@ -0,0 +1,5 @@
+tbselenium
+selenium
+pymongo
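+# NOTE: crawler.py uses the Selenium 4 find_element(By.XPATH, ...) API, so a
+# 4.x release is assumed; pin versions here (e.g. selenium==4.18.1) if needed.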