From 3f299938eb35bce5d0e17b6766ae6a34a8963f77 Mon Sep 17 00:00:00 2001
From: Harshdeep Mishra
Date: Fri, 14 Feb 2025 19:33:07 -0500
Subject: [PATCH] Add Tor-based dark web crawler with Bitcoin address extraction

---
 crawler/Dockerfile                   | 31 +++++++++++++
 crawler/bitcoin_address_extractor.py | 48 ++++++++++++++++++++
 crawler/config.py                    |  7 +++
 crawler/crawler.py                   | 71 ++++++++++++++++++++++++++++
 crawler/database.py                  | 37 ++++++++++++++++
 crawler/docker-compose.yml           | 24 ++++++++++
 crawler/main.py                      | 16 +++++++
 crawler/requirements.txt             |  5 ++
 8 files changed, 239 insertions(+)
 create mode 100644 crawler/Dockerfile
 create mode 100644 crawler/bitcoin_address_extractor.py
 create mode 100644 crawler/config.py
 create mode 100644 crawler/crawler.py
 create mode 100644 crawler/database.py
 create mode 100644 crawler/docker-compose.yml
 create mode 100644 crawler/main.py
 create mode 100644 crawler/requirements.txt

diff --git a/crawler/Dockerfile b/crawler/Dockerfile
new file mode 100644
index 0000000..1037a76
--- /dev/null
+++ b/crawler/Dockerfile
@@ -0,0 +1,31 @@
+FROM ubuntu:20.04
+
+# Avoid interactive apt prompts during the build
+ENV DEBIAN_FRONTEND=noninteractive
+
+# tor supplies the SOCKS proxy on 127.0.0.1:9050 that crawler.py routes
+# Firefox through; firefox and firefox-geckodriver give Selenium a browser
+# and driver to work with
+RUN apt-get update && apt-get install -y \
+    python3.8 python3-pip wget xz-utils tor firefox firefox-geckodriver \
+    && apt-get clean
+
+COPY requirements.txt .
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Pin a known Tor Browser release; bump the version as new releases ship.
+# The archive extracts to /tor-browser (keep TOR_PATH in config.py in sync).
+RUN wget https://www.torproject.org/dist/torbrowser/14.0.4/tor-browser-linux-x86_64-14.0.4.tar.xz \
+    && tar -xf tor-browser-linux-x86_64-14.0.4.tar.xz \
+    && rm tor-browser-linux-x86_64-14.0.4.tar.xz
+
+WORKDIR /app
+COPY . .
+
+# Start the Tor daemon, give it a few seconds to bootstrap, then run the crawler
+CMD ["bash", "-c", "tor & sleep 5 && python3 main.py"]
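+
+# A rough usage sketch; the image tag below is illustrative, not part of the
+# project:
+#   docker build -t darkweb-crawler .
+#   docker compose up    # preferred: also starts the MongoDB container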
diff --git a/crawler/bitcoin_address_extractor.py b/crawler/bitcoin_address_extractor.py
new file mode 100644
index 0000000..7ea67c9
--- /dev/null
+++ b/crawler/bitcoin_address_extractor.py
@@ -0,0 +1,48 @@
+from hashlib import sha256
+
+class BitcoinAddressExtractor:
+    digits58 = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
+
+    @staticmethod
+    def decode_base58(bc, length):
+        n = 0
+        for char in bc:
+            # index() raises ValueError on any non-base58 character, which
+            # check_bc treats as "not an address"
+            n = n * 58 + BitcoinAddressExtractor.digits58.index(char)
+        return n.to_bytes(length, 'big')
+
+    @staticmethod
+    def check_bc(bc):
+        '''
+        Check whether the input text is a valid Base58Check Bitcoin address.
+        '''
+        try:
+            bcbytes = BitcoinAddressExtractor.decode_base58(bc, 25)
+            # The last 4 bytes must equal the double-SHA256 checksum of the rest
+            return bcbytes[-4:] == sha256(sha256(bcbytes[:-4]).digest()).digest()[:4]
+        except Exception:
+            return False
+
+    @staticmethod
+    def extract_addresses(text):
+        addresses = []
+        i = 0
+        while i <= len(text) - 26:
+            # Legacy addresses start with '1' (P2PKH) or '3' (P2SH)
+            if text[i] == '1' or text[i] == '3':
+                # Try window lengths from 26 to 35 characters, the valid
+                # length range for Base58Check addresses
+                for win_len in range(26, 36):
+                    potential_address = text[i:i+win_len]
+                    if BitcoinAddressExtractor.check_bc(potential_address):
+                        addresses.append(potential_address)
+                        break
+            i += 1
+        return addresses
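+
+# A minimal usage sketch; the address below is the well-known genesis-block
+# address, included purely as sample data:
+if __name__ == "__main__":
+    sample = "donate: 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa"
+    print(BitcoinAddressExtractor.extract_addresses(sample))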
diff --git a/crawler/config.py b/crawler/config.py
new file mode 100644
index 0000000..bb772ac
--- /dev/null
+++ b/crawler/config.py
@@ -0,0 +1,7 @@
+import os
+
+# Update these values for your system
+CRAWL_LIMIT = 10000  # maximum number of pages to visit
+TOR_PATH = os.environ.get("TOR_PATH", "/tor-browser/")  # must match the Dockerfile extraction path
+DB_NAME = "crawlerData1"
+URL = 'http://thehiddenwiki.org/'  # seed URL for the crawl
diff --git a/crawler/crawler.py b/crawler/crawler.py
new file mode 100644
index 0000000..67ef0c3
--- /dev/null
+++ b/crawler/crawler.py
@@ -0,0 +1,71 @@
+# crawler.py
+import re
+
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.firefox.options import Options
+from selenium.common.exceptions import NoSuchElementException
+
+from bitcoin_address_extractor import BitcoinAddressExtractor
+from config import CRAWL_LIMIT
+
+class Crawler:
+    def __init__(self, tor_path, db):
+        # tor_path is kept for a future switch to tbselenium's TorBrowserDriver;
+        # for now a plain Firefox session is routed through Tor's SOCKS proxy
+        self.tor_path = tor_path
+        self.db = db  # crawl() depends on this; it was never assigned before
+
+        options = Options()
+        options.add_argument("-headless")  # run without a GUI (Selenium 4 API)
+
+        # Route all traffic, including DNS, through Tor's SOCKS5 proxy
+        options.set_preference("network.proxy.type", 1)
+        options.set_preference("network.proxy.socks", "127.0.0.1")
+        options.set_preference("network.proxy.socks_port", 9050)
+        options.set_preference("network.proxy.socks_remote_dns", True)
+
+        self.driver = webdriver.Firefox(options=options)
+
+    def crawl(self, url):
+        for _ in range(CRAWL_LIMIT):
+            if url is None:
+                break  # frontier is empty
+            try:
+                self.driver.get(url)
+
+                # Queue newly discovered .onion links for later visits
+                for link in self.get_all_links():
+                    self.db.insert_link(link)
+
+                text = self.get_page_text()
+                addresses = BitcoinAddressExtractor.extract_addresses(text)
+                if addresses:
+                    metadata = self.get_metadata()
+                    self.db.insert_page_data(url, metadata, addresses)
+
+                self.db.update_link_visited(url)
+                url = self.db.get_next_link()
+            except Exception as e:
+                print(f"Error while processing {url}: {e}")
+                self.db.update_link_visited(url)
+                url = self.db.get_next_link()
+
+    def get_all_links(self):
+        all_links = []
+        for elem in self.driver.find_elements(By.XPATH, "//a[@href]"):
+            link = elem.get_attribute("href")
+            if link and re.search(r'\.onion', link):
+                all_links.append(link)
+        return all_links
+
+    def get_page_text(self):
+        return self.driver.find_element(By.XPATH, "/html/body").text
+
+    def get_metadata(self):
+        try:
+            return self.driver.find_element(
+                By.XPATH, "//meta[@name='description']"
+            ).get_attribute("content")
+        except NoSuchElementException:
+            return None
diff --git a/crawler/database.py b/crawler/database.py
new file mode 100644
index 0000000..6fd367e
--- /dev/null
+++ b/crawler/database.py
@@ -0,0 +1,37 @@
+import os
+
+from pymongo import MongoClient
+
+class Database:
+    def __init__(self, db_name):
+        # Under docker-compose the MongoDB service is reachable as "mongodb",
+        # not "localhost", so the host is taken from the environment
+        host = os.environ.get("MONGO_HOST", "localhost")
+        self.client = MongoClient(host, 27017)
+        self.db = self.client[db_name]
+        self.websites = self.db['websites']
+        self.crawl_links = self.db['crawl_links']
+
+    def insert_link(self, link, visited=0):
+        # Upsert so re-discovered links don't create duplicate queue entries
+        self.crawl_links.update_one(
+            {"link": link},
+            {"$setOnInsert": {"link": link, "visited": visited}},
+            upsert=True,
+        )
+
+    def get_next_link(self):
+        # Returns None when no unvisited links remain
+        doc = self.crawl_links.find_one({"visited": 0})
+        return doc['link'] if doc else None
+
+    def update_link_visited(self, link):
+        self.crawl_links.update_one({"link": link}, {"$set": {"visited": 1}})
+
+    def insert_page_data(self, url, metadata, addresses):
+        record = {
+            "link": url,
+            "metadata": metadata,
+            "bitcoin_addresses": addresses,
+        }
+        self.websites.insert_one(record)
diff --git a/crawler/docker-compose.yml b/crawler/docker-compose.yml
new file mode 100644
index 0000000..cffafec
--- /dev/null
+++ b/crawler/docker-compose.yml
@@ -0,0 +1,24 @@
+services:
+  mongodb:
+    image: mongo:4.4
+    container_name: darkweb_mongo
+    restart: always
+    ports:
+      - "27017:27017"
+    volumes:
+      - mongo_data:/data/db
+
+  crawler:
+    build: .
+    container_name: darkweb_crawler
+    depends_on:
+      - mongodb
+    environment:
+      - TOR_PATH=/tor-browser/
+      - MONGO_HOST=mongodb  # reach the mongodb service rather than localhost
+    volumes:
+      - .:/app
+    restart: always
+
+volumes:
+  mongo_data:
diff --git a/crawler/main.py b/crawler/main.py
new file mode 100644
index 0000000..1a9377f
--- /dev/null
+++ b/crawler/main.py
@@ -0,0 +1,16 @@
+from database import Database
+from crawler import Crawler
+from config import DB_NAME, TOR_PATH, URL
+
+def main():
+    db = Database(DB_NAME)
+    crawler = Crawler(TOR_PATH, db)
+
+    # Insert the seed URL as unvisited so get_next_link() can find it
+    db.insert_link(URL, visited=0)
+
+    # Start the crawling process from the first unvisited link
+    crawler.crawl(db.get_next_link())
+
+if __name__ == "__main__":
+    main()
diff --git a/crawler/requirements.txt b/crawler/requirements.txt
new file mode 100644
index 0000000..f032df1
--- /dev/null
+++ b/crawler/requirements.txt
@@ -0,0 +1,5 @@
+tbselenium
+selenium
+pymongo
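+# NOTE: crawler.py uses the Selenium 4 find_element(By.XPATH, ...) API, so a
+# 4.x release is assumed; pin versions here (e.g. selenium==4.18.1) if needed.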