Load Preprocessing Data
darkanita committed Jun 9, 2024
1 parent fa74f9e commit 16c39e8
Showing 2 changed files with 70 additions and 0 deletions.
27 changes: 27 additions & 0 deletions .github/workflows/1. Load_Preprocessing_Data.yml
@@ -0,0 +1,27 @@
name: 1. Load Preprocessing Data

on:
  push:
    branches:
      - main
    paths:
      - 'src/data_pipelines/preprocessing.py'

jobs:
  login:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: 'pip' # caching pip dependencies
      - run: pip install -r requirements.txt
      - name: Login to WandB
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
        run: wandb login $WANDB_API_KEY
      - name: Run preprocessing.py
        env:
          HUGGINGFACEHUB_API_TOKEN: ${{ secrets.HUGGINGFACEHUB_API_TOKEN }}
        run: python src/data_pipelines/preprocessing.py --IdExecution ${{ github.run_number }}
43 changes: 43 additions & 0 deletions src/data_pipelines/preprocessing.py
@@ -0,0 +1,43 @@
import argparse

import wandb
#from dotenv import load_dotenv, find_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

#load_dotenv(find_dotenv())

# Embedding model used to vectorize the document chunks
model_name = "jinaai/jina-embeddings-v2-small-en"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

parser = argparse.ArgumentParser()
parser.add_argument('--IdExecution', type=str, help='ID of the execution')
args = parser.parse_args()

if args.IdExecution:
    print(f"IdExecution: {args.IdExecution}")

with wandb.init(project="LLMOps-Pycon2024",
                name=f"Preprocess Data ExecId-{args.IdExecution}",
                job_type="preprocess-data") as run:
    # Load the source PDF as one document per page
    file_path = "src/data/1810.04805v2.pdf"
    loader = PyPDFLoader(file_path)
    docs = loader.load()
    print(len(docs))

    # Chunk the pages with overlap so context survives the splits
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)

    # Embed the chunks and persist the Chroma index to disk
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory="src/rag_db",
    )

    # 📦 save the vector database to the artifact
    vectorstore_artifact = wandb.Artifact(
        "vector-database",
        type="dataset",
        description="Vector Database for RAG model",
        metadata={
            "source": file_path,
            "sizes": len(docs),
            "embedding": model_name,
            "chunks": len(splits),
            "destined_for": "rag-model",
        },
    )
    vectorstore_artifact.add_dir("src/rag_db")
    run.log_artifact(vectorstore_artifact)
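
A downstream job can then pull the persisted Chroma index from W&B instead of rebuilding it. The following is a minimal sketch of that consumption path, not part of this commit: the `retrieve-data` job type and the sample query are illustrative assumptions, while the project, artifact, and embedding-model names are taken from the script above.

import wandb
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Must match the model that built the index; a different embedding
# model would produce incompatible query vectors.
embeddings = HuggingFaceEmbeddings(model_name="jinaai/jina-embeddings-v2-small-en")

with wandb.init(project="LLMOps-Pycon2024", job_type="retrieve-data") as run:
    # Download the latest version of the vector database logged above
    artifact = run.use_artifact("vector-database:latest")
    local_dir = artifact.download()

    # Reopen the persisted index and run a similarity search against it
    vectorstore = Chroma(persist_directory=local_dir, embedding_function=embeddings)
    for doc in vectorstore.similarity_search("What is BERT pre-trained on?", k=3):
        print(doc.page_content[:200])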
