Skip to content

Commit

Permalink
read env file
Browse files Browse the repository at this point in the history
  • Loading branch information
harshdM99 committed Feb 17, 2025
1 parent 187b951 commit 51424bc
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 7 deletions.
2 changes: 1 addition & 1 deletion crawler/.env
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ CRAWL_LIMIT = 10000
DB_NAME = "crawlerData1"
CRAWLING_TABLE = "crawl_links"
ADDRESS_TABLE = "websites"
URL = "http://thehiddeanwiki.org/"
SEED_URL = "http://thehiddenwiki.org/"
26 changes: 22 additions & 4 deletions crawler/database.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,29 @@
from pymongo import MongoClient

class Database:
    """Thin MongoDB wrapper for the crawler's link queue and results store.

    Connects to the ``mongodb`` host (a Docker-compose service name —
    TODO confirm) on the default port, ensures both collections exist,
    and seeds the crawl queue with SEED_URL when the queue is empty.
    """

    def __init__(self, DB_NAME, CRAWLING_TABLE, ADDRESS_TABLE, SEED_URL):
        """Open the connection, ensure collections, and seed the queue.

        DB_NAME        -- database to use
        CRAWLING_TABLE -- collection holding the crawl frontier
        ADDRESS_TABLE  -- collection holding discovered websites
        SEED_URL       -- first URL to crawl when the frontier is empty
        """
        self.client = MongoClient('mongodb', 27017)
        self.db = self.client[DB_NAME]

        self.create_collections(CRAWLING_TABLE, ADDRESS_TABLE)

        self.address_table = self.db[ADDRESS_TABLE]
        self.crawling_table = self.db[CRAWLING_TABLE]

        # Seed only when the frontier is completely empty, so restarting
        # the crawler resumes the existing queue instead of re-seeding.
        if self.crawling_table.count_documents({}) == 0:
            self.crawling_table.insert_one({"link": SEED_URL, "visited": 0})
            print(f"Seed URL inserted: {SEED_URL}")

    def create_collections(self, CRAWLING_TABLE, ADDRESS_TABLE):
        """Ensure both collections exist, creating any that are missing."""
        # One round trip instead of querying the name list per collection.
        existing = self.db.list_collection_names()

        if CRAWLING_TABLE not in existing:
            self.db.create_collection(CRAWLING_TABLE)
            print(f"Created collection: {CRAWLING_TABLE}")

        if ADDRESS_TABLE not in existing:
            self.db.create_collection(ADDRESS_TABLE)
            print(f"Created collection: {ADDRESS_TABLE}")

    def insert_link(self, link, visited=0):
        """Append a link to the crawl frontier (0 = not yet visited)."""
        self.crawling_table.insert_one({"link": link, "visited": visited})
Expand Down
4 changes: 2 additions & 2 deletions crawler/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@
# Runtime configuration, overridable via environment (see crawler/.env).
DB_NAME = os.getenv("DB_NAME", "crawlerData1")
CRAWLING_TABLE = os.getenv("CRAWLING_TABLE", "crawl_links")
ADDRESS_TABLE = os.getenv("ADDRESS_TABLE", "websites")
SEED_URL = os.getenv("SEED_URL", "http://thehiddenwiki.org/")
# os.getenv returns a *string* when the variable is set; coerce so the
# crawl limit is always an int regardless of where the value came from.
CRAWL_LIMIT = int(os.getenv("CRAWL_LIMIT", 10000))

def main():
db = Database(DB_NAME, CRAWLING_TABLE, ADDRESS_TABLE)
db = Database(DB_NAME, CRAWLING_TABLE, ADDRESS_TABLE, SEED_URL)
crawler = Crawler(db, CRAWL_LIMIT)

# Start crawling process
Expand Down

0 comments on commit 51424bc

Please sign in to comment.