Commit
Restructure repository (#26)
* restructure

* 5-inference added

* added .gitignore

* refactoring

* Solved embedding insertion into the Qdrant vector index

* Added remaining Pulumi resources

---------

Co-authored-by: Vesa Alexandru <vesaalexandru95@gmail.com>
rsergiuistoc and alexandruvesa authored Jul 5, 2024
1 parent 8590e4e commit bd5f14d
Showing 132 changed files with 12,172 additions and 352 deletions.
48 changes: 48 additions & 0 deletions .docker/Dockerfile.bytewax
@@ -0,0 +1,48 @@
# Use an official Python runtime as a parent image
FROM python:3.11-slim-bullseye

ENV WORKSPACE_ROOT=/usr/src/app \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    POETRY_HOME="/opt/poetry" \
    POETRY_NO_INTERACTION=1

RUN mkdir -p $WORKSPACE_ROOT

# Install system dependencies
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends build-essential \
    gcc \
    python3-dev \
    curl \
    && apt-get clean

# Install Poetry
RUN curl -sSL https://install.python-poetry.org | python -

# Add Poetry to PATH
ENV PATH="$POETRY_HOME/bin:$PATH"

RUN apt-get remove -y curl

# Copy the pyproject.toml and poetry.lock files from the root directory
COPY ./pyproject.toml ./poetry.lock ./

# Install dependencies
RUN poetry config virtualenvs.create false && poetry install

# Set the working directory
WORKDIR $WORKSPACE_ROOT

# Copy the 3-feature-pipeline and any other necessary directories
COPY ./3-feature-pipeline .
COPY ./core ./core

# Set the PYTHONPATH environment variable
ENV PYTHONPATH=/usr/src/app

RUN chmod +x /usr/src/app/scripts/bytewax_entrypoint.sh

# Command to run the Bytewax pipeline script
CMD ["/usr/src/app/scripts/bytewax_entrypoint.sh"]
35 changes: 35 additions & 0 deletions .docker/Dockerfile.cdc
@@ -0,0 +1,35 @@
# Use an official Python runtime as a parent image
FROM python:3.11-slim

# Install system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    python3-dev \
    curl \
    build-essential \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Install Poetry
RUN pip install poetry
# Add Poetry to PATH
ENV PATH="/etc/poetry/bin:$PATH"
# Set the working directory
WORKDIR /app
# Copy the pyproject.toml and poetry.lock files from the root directory
COPY ./pyproject.toml ./poetry.lock ./
# Install dependencies
RUN poetry install --no-root
# Copy the 2-data-ingestion and core directories
COPY ./2-data-ingestion ./2-data-ingestion
# Set the PYTHONPATH environment variable
ENV PYTHONPATH=/app
# Command to run the script
CMD poetry run python /app/2-data-ingestion/cdc.py && tail -f /dev/null
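The cdc.py script this image runs is not among the files shown in this diff. As a rough sketch only, a MongoDB change-data-capture loop of the kind such a script typically contains might look like the following, assuming pymongo and the replica-set URI from the ingestion settings; the real script may forward changes to a message queue rather than print them:

import json

from pymongo import MongoClient


def stream_changes(mongo_uri: str, db_name: str) -> None:
    # Watch the whole database and react to every insert event.
    client = MongoClient(mongo_uri)
    with client[db_name].watch([{"$match": {"operationType": "insert"}}]) as stream:
        for change in stream:
            # A real CDC script would publish this document to a queue here.
            print(json.dumps(change["fullDocument"], default=str))


if __name__ == "__main__":
    stream_changes(
        "mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set",
        "scrabble",
    )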
13 changes: 7 additions & 6 deletions course/module-1/Dockerfile → .docker/Dockerfile.crawlers
@@ -44,20 +44,21 @@ RUN yum install -y \
 COPY --from=build /opt/chrome-linux /opt/chrome
 COPY --from=build /opt/chromedriver /opt/
 
-COPY poetry.lock pyproject.toml ./
+COPY ./pyproject.toml ./poetry.lock ./
 
 # Install Poetry, export dependencies to requirements.txt, and install dependencies
 # in the Lambda task directory, finally cleanup manifest files.
-RUN python3 -m pip install --upgrade pip && pip3 install poetry
-RUN poetry export -f requirements.txt > requirements.txt && \
-    pip3 install --no-cache-dir -r requirements.txt --target "${LAMBDA_TASK_ROOT}" && \
+RUN python -m pip install --upgrade pip && pip install poetry
+RUN poetry export --without 3-feature-pipeline,ml -f requirements.txt > requirements.txt && \
+    pip install --no-cache-dir -r requirements.txt --target "${LAMBDA_TASK_ROOT}" && \
     rm requirements.txt pyproject.toml poetry.lock
 
 # Optional TLS CA only if you plan to store the extracted data into Document DB
 RUN wget https://truststore.pki.rds.amazonaws.com/global/global-bundle.pem -P ${LAMBDA_TASK_ROOT}
+ENV PYTHONPATH=${LAMBDA_TASK_ROOT}/1-data-crawling
 
 # Copy function code
-COPY . ${LAMBDA_TASK_ROOT}
+COPY ./1-data-crawling ${LAMBDA_TASK_ROOT}/1-data-crawling
 
 # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile)
-CMD ["main.handler"]
+CMD ["1-data-crawling.main.handler"]
4 changes: 4 additions & 0 deletions .gitignore
@@ -162,6 +162,10 @@ cython_debug/
# Ruff
.ruff_cache

data/
dataset/
data

# Data
output
.cache
1 change: 1 addition & 0 deletions .python-version
@@ -0,0 +1 @@
3.11.4
File renamed without changes.
12 changes: 12 additions & 0 deletions 1-data-crawling/config.py
@@ -0,0 +1,12 @@
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file="../.env", env_file_encoding="utf-8")

    # MongoDB configs
    MONGO_DATABASE_HOST: str = "mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set"
    MONGO_DATABASE_NAME: str = "scrabble"


settings = Settings()
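A short usage sketch for these settings, assuming pymongo is installed; the course's own database layer lives in the db package and may wrap this differently:

from pymongo import MongoClient

from config import settings

# Connect to the replica set from Settings and select the working database.
client = MongoClient(settings.MONGO_DATABASE_HOST)
db = client[settings.MONGO_DATABASE_NAME]
print(db.list_collection_names())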
5 changes: 5 additions & 0 deletions 1-data-crawling/crawlers/__init__.py
@@ -0,0 +1,5 @@
from .github import GithubCrawler
from .linkedin import LinkedInCrawler
from .medium import MediumCrawler

__all__ = ["GithubCrawler", "LinkedInCrawler", "MediumCrawler"]
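These exports lend themselves to a small registry keyed by URL pattern. A hypothetical dispatcher, not part of this commit, illustrating how a caller could pick a crawler from the exported classes:

import re

from crawlers import GithubCrawler, LinkedInCrawler, MediumCrawler

# Map URL prefixes to crawler classes; the first matching pattern wins.
_CRAWLER_REGISTRY = {
    r"https://(www\.)?github\.com": GithubCrawler,
    r"https://(www\.)?linkedin\.com": LinkedInCrawler,
    r"https://(www\.)?medium\.com": MediumCrawler,
}


def get_crawler(link: str):
    for pattern, crawler_cls in _CRAWLER_REGISTRY.items():
        if re.match(pattern, link):
            return crawler_cls()
    raise ValueError(f"No crawler registered for {link}")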
67 changes: 67 additions & 0 deletions 1-data-crawling/crawlers/base.py
@@ -0,0 +1,67 @@
import time
from abc import ABC, abstractmethod
from tempfile import mkdtemp

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from db.documents import BaseDocument


class BaseCrawler(ABC):
    model: type[BaseDocument]

    @abstractmethod
    def extract(self, link: str, **kwargs) -> None: ...


class BaseAbstractCrawler(BaseCrawler, ABC):
    def __init__(self, scroll_limit: int = 5) -> None:
        options = webdriver.ChromeOptions()
        options.binary_location = "/opt/chrome/chrome"
        options.add_argument("--no-sandbox")
        options.add_argument("--headless=new")
        options.add_argument("--single-process")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-gpu")
        options.add_argument("--log-level=3")
        options.add_argument("--disable-popup-blocking")
        options.add_argument("--disable-notifications")
        options.add_argument("--disable-dev-tools")
        options.add_argument("--ignore-certificate-errors")
        options.add_argument("--no-zygote")
        options.add_argument(f"--user-data-dir={mkdtemp()}")
        options.add_argument(f"--data-path={mkdtemp()}")
        options.add_argument(f"--disk-cache-dir={mkdtemp()}")
        options.add_argument("--remote-debugging-port=9222")

        self.set_extra_driver_options(options)

        self.scroll_limit = scroll_limit
        self.driver = webdriver.Chrome(
            service=webdriver.ChromeService("/opt/chromedriver"),
            options=options,
        )

    def set_extra_driver_options(self, options: Options) -> None:
        pass

    def login(self) -> None:
        pass

    def scroll_page(self) -> None:
        """Scroll through the page based on the scroll limit."""
        current_scroll = 0
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
            )
            time.sleep(5)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height or (
                self.scroll_limit and current_scroll >= self.scroll_limit
            ):
                break
            last_height = new_height
            current_scroll += 1
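A hedged example of how a concrete crawler might extend BaseAbstractCrawler; the ArticleDocument model and the body-text selector are assumptions for illustration and do not reproduce the course's actual Medium or LinkedIn crawlers:

from crawlers.base import BaseAbstractCrawler
from db.documents import ArticleDocument  # assumed document model for this sketch


class ExampleArticleCrawler(BaseAbstractCrawler):
    model = ArticleDocument

    def set_extra_driver_options(self, options) -> None:
        # Per-crawler driver tuning hooks into the base constructor.
        options.add_argument("--window-size=1920,1080")

    def extract(self, link: str, **kwargs) -> None:
        self.driver.get(link)
        self.scroll_page()
        content = self.driver.find_element("tag name", "body").text
        instance = self.model(link=link, content=content, owner_id=kwargs.get("user"))
        instance.save()
        self.driver.quit()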
57 changes: 57 additions & 0 deletions 1-data-crawling/crawlers/github.py
@@ -0,0 +1,57 @@
import os
import shutil
import subprocess
import tempfile

from aws_lambda_powertools import Logger

from crawlers.base import BaseCrawler
from db.documents import RepositoryDocument

logger = Logger(service="llm-twin-course/crawler")


class GithubCrawler(BaseCrawler):
    model = RepositoryDocument

    def __init__(self, ignore=(".git", ".toml", ".lock", ".png")) -> None:
        super().__init__()
        self._ignore = ignore

    def extract(self, link: str, **kwargs) -> None:
        logger.info(f"Starting scraping GitHub repository: {link}")

        repo_name = link.rstrip("/").split("/")[-1]

        local_temp = tempfile.mkdtemp()

        try:
            os.chdir(local_temp)
            subprocess.run(["git", "clone", link])

            repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])

            tree = {}
            for root, dirs, files in os.walk(repo_path):
                dir = root.replace(repo_path, "").lstrip("/")
                if dir.startswith(self._ignore):
                    continue

                for file in files:
                    if file.endswith(self._ignore):
                        continue
                    file_path = os.path.join(dir, file)
                    with open(os.path.join(root, file), "r", errors="ignore") as f:
                        tree[file_path] = f.read().replace(" ", "")

            instance = self.model(
                name=repo_name, link=link, content=tree, owner_id=kwargs.get("user")
            )
            instance.save()

        except Exception:
            raise
        finally:
            shutil.rmtree(local_temp)

        logger.info(f"Finished scraping GitHub repository: {link}")