Load Preprocessing Data

darkanita · Jun 9, 2024 · 16c39e8 · 16c39e8
1 parent fa74f9e
commit 16c39e8
Show file tree

Hide file tree

Showing 2 changed files with 70 additions and 0 deletions.
diff --git a/.github/workflows/1. Load_Preprocessing_Data.yml b/.github/workflows/1. Load_Preprocessing_Data.yml
@@ -0,0 +1,27 @@
+name: 1. Load Preprocessing Data
+
+on:
+    push:
+      branches:
+        - main
+      paths:
+        - 'src/data_pipelines/preprocessing.py'  # only re-run when the preprocessing script changes
+
+jobs:
+  login:
+    runs-on: ubuntu-latest
+    steps:
+        - uses: actions/checkout@v4
+        - uses: actions/setup-python@v5
+          with:
+            python-version: '3.12'
+            cache: 'pip' # caching pip dependencies
+        - run: pip install -r requirements.txt
+        - name: Login to WandB
+          env:
+            WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
+          run: wandb login "$WANDB_API_KEY"
+        - name: Run preprocessing.py
+          env:
+            HUGGINGFACEHUB_API_TOKEN: ${{ secrets.HUGGINGFACEHUB_API_TOKEN }}
+          run: python src/data_pipelines/preprocessing.py --IdExecution ${{ github.run_number }}  # same path as the trigger above
diff --git a/src/data_pipelines/preprocessing.py b/src/data_pipelines/preprocessing.py
@@ -0,0 +1,43 @@
+import os
+import argparse
+import langchain
+import wandb
+#from dotenv import load_dotenv, find_dotenv
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_chroma import Chroma
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+#load_dotenv(find_dotenv())
+
+# Pipeline settings used by the steps below and recorded in the artifact metadata.
+model_name = "jinaai/jina-embeddings-v2-small-en"
+file_path = "src/data/1810.04805v2.pdf"
+persist_dir = "src/rag_db"  # one constant for both the Chroma directory and the artifact upload
+
+# Parse CLI arguments first so --help / invalid flags exit before the embedding model is built.
+parser = argparse.ArgumentParser()
+parser.add_argument('--IdExecution', type=str, help='ID of the execution')
+args = parser.parse_args()
+
+if args.IdExecution:
+    print(f"IdExecution: {args.IdExecution}")
+
+embeddings = HuggingFaceEmbeddings(model_name=model_name)
+
+# Everything below runs inside a single W&B run named after the execution id.
+with wandb.init(project="LLMOps-Pycon2024",name=f"Preprocess Data ExecId-{args.IdExecution}", job_type="preprocess-data") as run:
+    # Load the source PDF and report how many documents the loader produced.
+    docs = PyPDFLoader(file_path).load()
+    print(len(docs))
+    # Split into overlapping chunks, embed them and persist the Chroma store under persist_dir.
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    splits = text_splitter.split_documents(docs)
+    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=persist_dir)
+    # 📦 save the vector database to the artifact
+    vectorstore_artifact = wandb.Artifact(
+        "vector-database", type="dataset", description="Vector Database for RAG model",
+        metadata={"source": file_path, "sizes": len(docs), "embedding": model_name,
+                  "chunks": len(splits), "destined_for": "rag-model"})
+    vectorstore_artifact.add_dir(persist_dir)
+    run.log_artifact(vectorstore_artifact)