-
Notifications
You must be signed in to change notification settings - Fork 505
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* restructure * 5-inference added * 5-inference added * added .gitignore * refactoring * Solved Embedding insertion into Qdrant vector index * Added remaining Pulumi resources --------- Co-authored-by: Vesa Alexandru <vesaalexandru95@gmail.com>
- Loading branch information
1 parent
8590e4e
commit bd5f14d
Showing
132 changed files
with
12,172 additions
and
352 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
# Image for the Bytewax feature pipeline (3-feature-pipeline).
# Use an official Python runtime as a parent image
FROM python:3.11-slim-bullseye

ENV WORKSPACE_ROOT=/usr/src/app \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    POETRY_HOME="/opt/poetry" \
    POETRY_NO_INTERACTION=1

RUN mkdir -p $WORKSPACE_ROOT

# Install system dependencies.
# The apt package lists are removed in the same layer so they do not end up
# in the final image.
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        build-essential \
        gcc \
        python3-dev \
        curl \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Poetry (the installer honours POETRY_HOME set above)
RUN curl -sSL https://install.python-poetry.org | python -

# Add Poetry to PATH
ENV PATH="$POETRY_HOME/bin:$PATH"

# curl was only needed to fetch the Poetry installer
RUN apt-get remove -y curl

# Copy the pyproject.toml and poetry.lock files from the root directory.
# NOTE: WORKDIR is not set yet, so these land in the image's default
# working directory and `poetry install` below runs from there.
COPY ./pyproject.toml ./poetry.lock ./

# Install dependencies into the system interpreter (no virtualenv).
# --no-root: the project sources are not copied yet, so only the
# dependencies can be installed at this point.
RUN poetry config virtualenvs.create false && poetry install --no-root

# Set the working directory
WORKDIR $WORKSPACE_ROOT

# Copy the 3-feature-pipeline and any other necessary directories
COPY ./3-feature-pipeline .
COPY ./core ./core

# Set the PYTHONPATH environment variable
ENV PYTHONPATH=/usr/src/app

RUN chmod +x /usr/src/app/scripts/bytewax_entrypoint.sh

# Command to run the Bytewax pipeline script
CMD ["/usr/src/app/scripts/bytewax_entrypoint.sh"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# Image for the data-ingestion CDC script (2-data-ingestion/cdc.py).
# Use an official Python runtime as a parent image
FROM python:3.11-slim

# Install system dependencies; package lists are removed in the same layer
# to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
        gcc \
        python3-dev \
        curl \
        build-essential \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Poetry
RUN pip install --no-cache-dir poetry

# NOTE(review): Poetry is installed with pip above, so its executable is
# already on PATH; nothing in this file creates /etc/poetry/bin. The entry is
# kept only so the environment stays identical for anything relying on it.
ENV PATH="/etc/poetry/bin:$PATH"

# Set the working directory
WORKDIR /app

# Copy the pyproject.toml and poetry.lock files from the root directory
COPY ./pyproject.toml ./poetry.lock ./

# Install dependencies only (project sources are copied afterwards)
RUN poetry install --no-root

# Copy the 2-data-ingestion directory
COPY ./2-data-ingestion ./2-data-ingestion

# Set the PYTHONPATH environment variable
ENV PYTHONPATH=/app

# Run the CDC script; if it exits successfully, `tail -f /dev/null` keeps the
# container alive.
CMD poetry run python /app/2-data-ingestion/cdc.py && tail -f /dev/null
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -162,6 +162,10 @@ cython_debug/ | |
# Ruff | ||
.ruff_cache | ||
|
||
data/ | ||
dataset/ | ||
data | ||
|
||
# Data | ||
output | ||
.cache | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
3.11.4 |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
from pydantic_settings import BaseSettings, SettingsConfigDict | ||
|
||
|
||
class Settings(BaseSettings):
    """Configuration values for the service, declared as pydantic settings.

    The settings config points at a ``../.env`` file (UTF-8 encoded); the
    values below are the defaults declared in code.
    """

    # MongoDB configs
    MONGO_DATABASE_HOST: str = "mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set"
    MONGO_DATABASE_NAME: str = "scrabble"

    model_config = SettingsConfigDict(
        env_file="../.env",
        env_file_encoding="utf-8",
    )


# Shared settings instance, created once when this module is imported.
settings = Settings()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from .github import GithubCrawler | ||
from .linkedin import LinkedInCrawler | ||
from .medium import MediumCrawler | ||
|
||
__all__ = ["GithubCrawler", "LinkedInCrawler", "MediumCrawler"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
import time | ||
from abc import ABC, abstractmethod | ||
from tempfile import mkdtemp | ||
|
||
from selenium import webdriver | ||
from selenium.webdriver.chrome.options import Options | ||
|
||
from db.documents import BaseDocument | ||
|
||
|
||
class BaseCrawler(ABC):
    """Abstract interface shared by all crawlers."""

    # Document class the crawler works with; assigned by concrete subclasses.
    model: type[BaseDocument]

    @abstractmethod
    def extract(self, link: str, **kwargs) -> None:
        """Process the resource at ``link``; must be implemented by subclasses."""
|
||
|
||
class BaseAbstractCrawler(BaseCrawler, ABC):
    """Base class for crawlers that drive a headless Chrome via Selenium."""

    def __init__(self, scroll_limit: int = 5) -> None:
        """Build the Chrome options and start the WebDriver.

        Args:
            scroll_limit: Maximum number of scrolls ``scroll_page`` performs.
                A falsy value (e.g. ``0``) disables the cap, in which case
                scrolling stops only when the page height stops growing.
        """
        options = webdriver.ChromeOptions()
        options.binary_location = "/opt/chrome/chrome"

        static_flags = (
            "--no-sandbox",
            "--headless=new",
            "--single-process",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--log-level=3",
            "--disable-popup-blocking",
            "--disable-notifications",
            "--disable-dev-tools",
            "--ignore-certificate-errors",
            "--no-zygote",
        )
        for flag in static_flags:
            options.add_argument(flag)

        # Each browser instance gets its own freshly created temp directories.
        options.add_argument(f"--user-data-dir={mkdtemp()}")
        options.add_argument(f"--data-path={mkdtemp()}")
        options.add_argument(f"--disk-cache-dir={mkdtemp()}")
        options.add_argument("--remote-debugging-port=9222")

        # Hook for subclasses to adjust the options before the driver starts.
        self.set_extra_driver_options(options)

        self.scroll_limit = scroll_limit
        self.driver = webdriver.Chrome(
            service=webdriver.ChromeService("/opt/chromedriver"),
            options=options,
        )

    def set_extra_driver_options(self, options: Options) -> None:
        """Customize ``options`` before the driver is created (no-op by default)."""

    def login(self) -> None:
        """Authenticate with the target site (no-op by default)."""

    def scroll_page(self) -> None:
        """Scroll to the bottom of the current page until no new content loads.

        Stops when the document height no longer grows after a scroll, or
        once ``scroll_limit`` scrolls have been performed (when the limit is
        truthy).
        """
        scrolls_done = 0
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
            )
            # Give the page time to load the content triggered by the scroll.
            time.sleep(5)
            # Count the scroll before testing the limit so that exactly
            # ``scroll_limit`` scrolls happen (counting afterwards allowed
            # one extra scroll).
            scrolls_done += 1

            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            if self.scroll_limit and scrolls_done >= self.scroll_limit:
                break
            last_height = new_height
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import os | ||
import shutil | ||
import subprocess | ||
import tempfile | ||
|
||
from aws_lambda_powertools import Logger | ||
|
||
from crawlers.base import BaseCrawler | ||
from db.documents import RepositoryDocument | ||
|
||
logger = Logger(service="llm-twin-course/crawler") | ||
|
||
|
||
class GithubCrawler(BaseCrawler):
    """Crawler that clones a GitHub repository and stores its file contents."""

    model = RepositoryDocument

    def __init__(self, ignore=(".git", ".toml", ".lock", ".png")) -> None:
        """Create the crawler.

        Args:
            ignore: Tuple of strings; directories whose repo-relative path
                starts with one of them and files whose name ends with one
                of them are skipped.
        """
        super().__init__()
        self._ignore = ignore

    def extract(self, link: str, **kwargs) -> None:
        """Clone ``link``, collect its files and save them as a document.

        Args:
            link: URL of the repository to clone.
            **kwargs: ``user`` (optional) is stored as the document's
                ``owner_id``.

        Raises:
            subprocess.CalledProcessError: If ``git clone`` fails.
        """
        logger.info(f"Starting scrapping GitHub repository: {link}")

        repo_name = link.rstrip("/").split("/")[-1]

        local_temp = tempfile.mkdtemp()

        try:
            # Run git inside the temp dir through ``cwd`` instead of
            # ``os.chdir``: changing the process-wide working directory would
            # leave it pointing at a deleted folder once the temp dir is
            # removed below. ``check=True`` surfaces a failed clone directly
            # instead of as an IndexError on the empty directory listing.
            subprocess.run(["git", "clone", link], cwd=local_temp, check=True)

            repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])

            tree = {}
            for root, _dirs, files in os.walk(repo_path):
                rel_dir = os.path.relpath(root, repo_path)
                if rel_dir == os.curdir:
                    # Repository root: keep keys as bare file names.
                    rel_dir = ""
                if rel_dir.startswith(self._ignore):
                    continue

                for file in files:
                    if file.endswith(self._ignore):
                        continue
                    file_path = os.path.join(rel_dir, file)
                    with open(os.path.join(root, file), "r", errors="ignore") as f:
                        # Spaces are stripped from the stored content.
                        tree[file_path] = f.read().replace(" ", "")

            instance = self.model(
                name=repo_name, link=link, content=tree, owner_id=kwargs.get("user")
            )
            instance.save()
        finally:
            # Always remove the clone, whether or not extraction succeeded.
            shutil.rmtree(local_temp)

        logger.info(f"Finished scrapping GitHub repository: {link}")
Oops, something went wrong.