From 9aee81a30a78b94df1c58c68f60f65a2932d2e2a Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Mon, 14 Apr 2025 12:47:20 -0400
Subject: [PATCH 01/64] docker deployment

---
 .../workflows/backend-docker-image-build.yml  | 20 +++++++++++++++++++
 airflow/dags/dataflow/chunk_data.py           |  3 ++-
 airflow/dags/dataflow/scraper.py              |  3 ++-
 airflow/dags/dataflow/store_data.py           |  3 ++-
 prefectWorkflows/dataflow/chunk_data.py       |  3 ++-
 prefectWorkflows/dataflow/scraper.py          |  3 ++-
 prefectWorkflows/dataflow/store_data.py       |  3 ++-
 services/backend/Dockerfile                   |  3 ++-
 services/backend/main.py                      |  6 +++---
 services/backend/src/dataflow/chunk_data.py   | 15 +++++++++++++-
 services/backend/src/dataflow/scraper.py      | 16 ++++++++++++++-
 services/backend/src/dataflow/store_data.py   | 14 ++++++++++++-
 12 files changed, 79 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/backend-docker-image-build.yml b/.github/workflows/backend-docker-image-build.yml
index f26e491..5f5897e 100644
--- a/.github/workflows/backend-docker-image-build.yml
+++ b/.github/workflows/backend-docker-image-build.yml
@@ -31,3 +31,23 @@ jobs:
           docker build -t $IMAGE .
           docker push $IMAGE
           cd ../..
+      - name: gcp deploy
+        run: |
+          - name: gcp deploy
+  run: |
+    gcloud run deploy backend-service \
+      --source services/backend \
+      --region ${{ secrets.GCP_REGION }} \
+      --platform managed \
+      --allow-unauthenticated \
+      --set-env-vars \
+      AIRFLOW_UID=5000,\
+      BASE_URL='https://www.khoury.northeastern.edu/',\
+      MAX_DEPTH=3,\
+      CONCURRENT_REQUESTS=10,\
+      DATA_FOLDER="scraped_data",\
+      MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},\
+      MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},\
+      BUCKET_NAME=${{ secrets.BUCKET_NAME }},\
+      RAW_DATA_FOLDER=raw_data,\
+      FAISS_INDEX_FOLDER=faiss_index
diff --git a/airflow/dags/dataflow/chunk_data.py b/airflow/dags/dataflow/chunk_data.py
index fc730d4..30f3ab4 100644
--- a/airflow/dags/dataflow/chunk_data.py
+++ b/airflow/dags/dataflow/chunk_data.py
@@ -11,7 +11,8 @@
 from dataflow.store_data import upload_faiss_index_to_bucket
 load_dotenv(override=True)
 BUCKET_NAME= os.getenv('BUCKET_NAME')
-GOOGLE_APPLICATION_CREDENTIALS=os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
+from google.auth import default
+credentials, project = default()
 RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER')
 def chunk_data():
     # Load all JSON files from a directory
diff --git a/airflow/dags/dataflow/scraper.py b/airflow/dags/dataflow/scraper.py
index 9a0e5f9..d426ece 100644
--- a/airflow/dags/dataflow/scraper.py
+++ b/airflow/dags/dataflow/scraper.py
@@ -13,7 +13,8 @@
 BASE_URL = os.getenv('BASE_URL')
 MAX_DEPTH = int(os.getenv('MAX_DEPTH'))             # Maximum recursion depth (base URL is depth 0)
 CONCURRENT_REQUESTS = int(os.getenv('CONCURRENT_REQUESTS'))  # Maximum number of concurrent requests
-GOOGLE_APPLICATION_CREDENTIALS =os.getenv('GOOGLE_APPLICATION_CREDENTIALS ')
+from google.auth import default
+credentials, project = default()
 # Create folder for JSON data
 DATA_FOLDER = "scraped_data"
 if not os.path.exists(DATA_FOLDER):
diff --git a/airflow/dags/dataflow/store_data.py b/airflow/dags/dataflow/store_data.py
index d142be3..95ffa68 100644
--- a/airflow/dags/dataflow/store_data.py
+++ b/airflow/dags/dataflow/store_data.py
@@ -5,7 +5,8 @@
 BUCKET_NAME= os.getenv('BUCKET_NAME')
 RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER')
 FAISS_INDEX_FOLDER= os.getenv('FAISS_INDEX_FOLDER')
-GOOGLE_APPLICATION_CREDENTIALS=os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
+from google.auth import default
+credentials, project = default()
 
 def get_blob_from_bucket():
     storage_client = Client()
diff --git a/prefectWorkflows/dataflow/chunk_data.py b/prefectWorkflows/dataflow/chunk_data.py
index 643dcba..aa83b8a 100644
--- a/prefectWorkflows/dataflow/chunk_data.py
+++ b/prefectWorkflows/dataflow/chunk_data.py
@@ -11,7 +11,8 @@
 from dataflow.store_data import upload_faiss_index_to_bucket
 load_dotenv(override=True)
 BUCKET_NAME= os.getenv('BUCKET_NAME')
-GOOGLE_APPLICATION_CREDENTIALS=os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
+from google.auth import default
+credentials, project = default()
 RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER')
 def chunk_data():
     # Load all JSON files from a directory
diff --git a/prefectWorkflows/dataflow/scraper.py b/prefectWorkflows/dataflow/scraper.py
index 9a0e5f9..d426ece 100644
--- a/prefectWorkflows/dataflow/scraper.py
+++ b/prefectWorkflows/dataflow/scraper.py
@@ -13,7 +13,8 @@
 BASE_URL = os.getenv('BASE_URL')
 MAX_DEPTH = int(os.getenv('MAX_DEPTH'))             # Maximum recursion depth (base URL is depth 0)
 CONCURRENT_REQUESTS = int(os.getenv('CONCURRENT_REQUESTS'))  # Maximum number of concurrent requests
-GOOGLE_APPLICATION_CREDENTIALS =os.getenv('GOOGLE_APPLICATION_CREDENTIALS ')
+from google.auth import default
+credentials, project = default()
 # Create folder for JSON data
 DATA_FOLDER = "scraped_data"
 if not os.path.exists(DATA_FOLDER):
diff --git a/prefectWorkflows/dataflow/store_data.py b/prefectWorkflows/dataflow/store_data.py
index 039dc22..de6cc16 100644
--- a/prefectWorkflows/dataflow/store_data.py
+++ b/prefectWorkflows/dataflow/store_data.py
@@ -5,7 +5,8 @@
 BUCKET_NAME= os.getenv('BUCKET_NAME')
 RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER')
 FAISS_INDEX_FOLDER= os.getenv('FAISS_INDEX_FOLDER')
-GOOGLE_APPLICATION_CREDENTIALS=os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
+from google.auth import default
+credentials, project = default()
 
 def get_blob_from_bucket():
     storage_client = Client()
diff --git a/services/backend/Dockerfile b/services/backend/Dockerfile
index b6a91f8..a91ae59 100644
--- a/services/backend/Dockerfile
+++ b/services/backend/Dockerfile
@@ -16,7 +16,8 @@ WORKDIR /app
 
 # Pre-copy requirements separately for Docker cache efficiency
 COPY runtime-requirements.txt .
-
+RUN pip install --upgrade pip
+RUN pip install "huggingface_hub[hf_xet]"
 # Install Python dependencies
 RUN pip install --no-cache-dir -r runtime-requirements.txt
 
diff --git a/services/backend/main.py b/services/backend/main.py
index 54f6212..056dc8c 100644
--- a/services/backend/main.py
+++ b/services/backend/main.py
@@ -49,6 +49,6 @@ def post(self):
            
 
 if __name__=="__main__":
-    PORT=os.getenv('PORT')
-    HOST=os.getenv('HOST')
-    app.run(host=HOST,port=PORT,debug=True)
\ No newline at end of file
+    PORT=os.getenv('PORT', 8080)
+
+    app.run(host='0.0.0.0',port=PORT,debug=True)
\ No newline at end of file
diff --git a/services/backend/src/dataflow/chunk_data.py b/services/backend/src/dataflow/chunk_data.py
index 7a43414..265ce1a 100644
--- a/services/backend/src/dataflow/chunk_data.py
+++ b/services/backend/src/dataflow/chunk_data.py
@@ -11,7 +11,20 @@
 from store_data import upload_faiss_index_to_bucket
 load_dotenv(override=True)
 BUCKET_NAME= os.getenv('BUCKET_NAME')
-GOOGLE_APPLICATION_CREDENTIALS=os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
+from google.auth import default
+from google.oauth2 import service_account
+
+# Try to get credentials - works in both Docker and Cloud Run
+try:
+    # First try Application Default Credentials (works in Cloud Run)
+    credentials, project = default()
+except Exception:
+    # Fall back to explicit credentials file (for Docker)
+    credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+    if credentials_path:
+        credentials = service_account.Credentials.from_service_account_file(credentials_path)
+    else:
+        raise Exception("No credentials available")
 RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER')
 def chunk_data():
     # Load all JSON files from a directory
diff --git a/services/backend/src/dataflow/scraper.py b/services/backend/src/dataflow/scraper.py
index 69cf26b..f5a99c4 100644
--- a/services/backend/src/dataflow/scraper.py
+++ b/services/backend/src/dataflow/scraper.py
@@ -13,7 +13,21 @@
 BASE_URL = os.getenv('BASE_URL')
 MAX_DEPTH = int(os.getenv('MAX_DEPTH'))             # Maximum recursion depth (base URL is depth 0)
 CONCURRENT_REQUESTS = int(os.getenv('CONCURRENT_REQUESTS'))  # Maximum number of concurrent requests
-GOOGLE_APPLICATION_CREDENTIALS =os.getenv('GOOGLE_APPLICATION_CREDENTIALS ')
+
+from google.auth import default
+from google.oauth2 import service_account
+
+# Try to get credentials - works in both Docker and Cloud Run
+try:
+    # First try Application Default Credentials (works in Cloud Run)
+    credentials, project = default()
+except Exception:
+    # Fall back to explicit credentials file (for Docker)
+    credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+    if credentials_path:
+        credentials = service_account.Credentials.from_service_account_file(credentials_path)
+    else:
+        raise Exception("No credentials available")
 # Create folder for JSON data
 DATA_FOLDER = "scraped_data"
 if not os.path.exists(DATA_FOLDER):
diff --git a/services/backend/src/dataflow/store_data.py b/services/backend/src/dataflow/store_data.py
index d142be3..837e630 100644
--- a/services/backend/src/dataflow/store_data.py
+++ b/services/backend/src/dataflow/store_data.py
@@ -5,8 +5,20 @@
 BUCKET_NAME= os.getenv('BUCKET_NAME')
 RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER')
 FAISS_INDEX_FOLDER= os.getenv('FAISS_INDEX_FOLDER')
-GOOGLE_APPLICATION_CREDENTIALS=os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
+from google.auth import default
+from google.oauth2 import service_account
 
+# Try to get credentials - works in both Docker and Cloud Run
+try:
+    # First try Application Default Credentials (works in Cloud Run)
+    credentials, project = default()
+except Exception:
+    # Fall back to explicit credentials file (for Docker)
+    credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+    if credentials_path:
+        credentials = service_account.Credentials.from_service_account_file(credentials_path)
+    else:
+        raise Exception("No credentials available")
 def get_blob_from_bucket():
     storage_client = Client()
     bucket = storage_client.bucket(BUCKET_NAME)

From e5571ae5138644cb92a6b6f9779e1da2c3ed3ca1 Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Mon, 14 Apr 2025 15:47:43 -0400
Subject: [PATCH 02/64] update workflow alignment

---
 .../workflows/backend-docker-image-build.yml  | 37 +++++++++----------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/backend-docker-image-build.yml b/.github/workflows/backend-docker-image-build.yml
index 5f5897e..62b06ac 100644
--- a/.github/workflows/backend-docker-image-build.yml
+++ b/.github/workflows/backend-docker-image-build.yml
@@ -31,23 +31,22 @@ jobs:
           docker build -t $IMAGE .
           docker push $IMAGE
           cd ../..
-      - name: gcp deploy
+
+      - name: Deploy to Cloud Run
         run: |
-          - name: gcp deploy
-  run: |
-    gcloud run deploy backend-service \
-      --source services/backend \
-      --region ${{ secrets.GCP_REGION }} \
-      --platform managed \
-      --allow-unauthenticated \
-      --set-env-vars \
-      AIRFLOW_UID=5000,\
-      BASE_URL='https://www.khoury.northeastern.edu/',\
-      MAX_DEPTH=3,\
-      CONCURRENT_REQUESTS=10,\
-      DATA_FOLDER="scraped_data",\
-      MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},\
-      MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},\
-      BUCKET_NAME=${{ secrets.BUCKET_NAME }},\
-      RAW_DATA_FOLDER=raw_data,\
-      FAISS_INDEX_FOLDER=faiss_index
+          gcloud run deploy backend-service \
+            --source services/backend \
+            --region ${{ secrets.GCP_REGION }} \
+            --platform managed \
+            --allow-unauthenticated \
+            --set-env-vars \
+              AIRFLOW_UID=5000,\
+              BASE_URL='https://www.khoury.northeastern.edu/',\
+              MAX_DEPTH=3,\
+              CONCURRENT_REQUESTS=10,\
+              DATA_FOLDER="scraped_data",\
+              MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},\
+              MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},\
+              BUCKET_NAME=${{ secrets.BUCKET_NAME }},\
+              RAW_DATA_FOLDER=raw_data,\
+              FAISS_INDEX_FOLDER=faiss_index

From 0a676d49066caf6da674409c9639eafbfe1ec94b Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Mon, 14 Apr 2025 15:59:04 -0400
Subject: [PATCH 03/64] pass Cloud Run env vars as a single comma-separated string

---
 .../workflows/backend-docker-image-build.yml  | 27 +++++++------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/backend-docker-image-build.yml b/.github/workflows/backend-docker-image-build.yml
index 62b06ac..995dc93 100644
--- a/.github/workflows/backend-docker-image-build.yml
+++ b/.github/workflows/backend-docker-image-build.yml
@@ -32,21 +32,12 @@ jobs:
           docker push $IMAGE
           cd ../..
 
-      - name: Deploy to Cloud Run
-        run: |
-          gcloud run deploy backend-service \
-            --source services/backend \
-            --region ${{ secrets.GCP_REGION }} \
-            --platform managed \
-            --allow-unauthenticated \
-            --set-env-vars \
-              AIRFLOW_UID=5000,\
-              BASE_URL='https://www.khoury.northeastern.edu/',\
-              MAX_DEPTH=3,\
-              CONCURRENT_REQUESTS=10,\
-              DATA_FOLDER="scraped_data",\
-              MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},\
-              MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},\
-              BUCKET_NAME=${{ secrets.BUCKET_NAME }},\
-              RAW_DATA_FOLDER=raw_data,\
-              FAISS_INDEX_FOLDER=faiss_index
+          - name: Deploy to Cloud Run
+          run: |
+            gcloud run deploy backend-service \
+              --source services/backend \
+              --region ${{ secrets.GCP_REGION }} \
+              --platform managed \
+              --allow-unauthenticated \
+              --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index"
+        

From c3fc67d8822631d39bec06e9573b084717088d66 Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Mon, 14 Apr 2025 16:00:56 -0400
Subject: [PATCH 04/64] make Cloud Run deploy its own workflow step

---
 .github/workflows/backend-docker-image-build.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/backend-docker-image-build.yml b/.github/workflows/backend-docker-image-build.yml
index 995dc93..91210d5 100644
--- a/.github/workflows/backend-docker-image-build.yml
+++ b/.github/workflows/backend-docker-image-build.yml
@@ -32,9 +32,9 @@ jobs:
           docker push $IMAGE
           cd ../..
 
-          - name: Deploy to Cloud Run
-          run: |
-            gcloud run deploy backend-service \
+      - name: Deploy to Cloud Run
+        run: |
+          gcloud run deploy backend-service \
               --source services/backend \
               --region ${{ secrets.GCP_REGION }} \
               --platform managed \

From 45a2ded715b240a571c302693c1228bb366b7b75 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Tue, 15 Apr 2025 15:14:35 -0400
Subject: [PATCH 05/64] update workflows

---
 .github/workflows/backend-docker-image-build.yml  | 13 +++++++------
 .github/workflows/frontend-docker-image-build.yml | 11 +++++++++++
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/backend-docker-image-build.yml b/.github/workflows/backend-docker-image-build.yml
index 91210d5..edbc057 100644
--- a/.github/workflows/backend-docker-image-build.yml
+++ b/.github/workflows/backend-docker-image-build.yml
@@ -35,9 +35,10 @@ jobs:
       - name: Deploy to Cloud Run
         run: |
           gcloud run deploy backend-service \
-              --source services/backend \
-              --region ${{ secrets.GCP_REGION }} \
-              --platform managed \
-              --allow-unauthenticated \
-              --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index"
-        
+        --source services/backend \
+        --region ${{ secrets.GCP_REGION }} \
+        --platform managed \
+        --allow-unauthenticated \
+        --memory 4Gi \
+        --timeout 3600s \
+        --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index"
diff --git a/.github/workflows/frontend-docker-image-build.yml b/.github/workflows/frontend-docker-image-build.yml
index 537cd3c..b547d18 100644
--- a/.github/workflows/frontend-docker-image-build.yml
+++ b/.github/workflows/frontend-docker-image-build.yml
@@ -31,3 +31,14 @@ jobs:
           docker build -t $IMAGE .
           docker push $IMAGE
           cd ../..
+
+      - name: Deploy to Cloud Run
+        run: |
+          gcloud run deploy frontend-service \
+            --source services/frontend \
+            --region ${{ secrets.GCP_REGION }} \
+            --platform managed \
+            --allow-unauthenticated \
+            --memory 1Gi \
+            --timeout 1800s \
+            --set-env-vars "API_URL=https://backend-service-273412-default.run.app/NuBot"

From d86bb0498705232385c312ed7f8708957249c18f Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Tue, 15 Apr 2025 15:17:31 -0400
Subject: [PATCH 06/64] fix indentation of gcloud deploy flags in backend workflow

---
 .github/workflows/backend-docker-image-build.yml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/backend-docker-image-build.yml b/.github/workflows/backend-docker-image-build.yml
index edbc057..5ece471 100644
--- a/.github/workflows/backend-docker-image-build.yml
+++ b/.github/workflows/backend-docker-image-build.yml
@@ -35,10 +35,10 @@ jobs:
       - name: Deploy to Cloud Run
         run: |
           gcloud run deploy backend-service \
-        --source services/backend \
-        --region ${{ secrets.GCP_REGION }} \
-        --platform managed \
-        --allow-unauthenticated \
-        --memory 4Gi \
-        --timeout 3600s \
-        --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index"
+            --source services/backend \
+            --region ${{ secrets.GCP_REGION }} \
+            --platform managed \
+            --allow-unauthenticated \
+            --memory 4Gi \
+            --timeout 3600s \
+            --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index"

From 909e2dcc4c00d12484c414a3348b8501b5cc079b Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Tue, 15 Apr 2025 18:36:30 -0400
Subject: [PATCH 07/64] add docker ignore to front end

---
 services/frontend/.dockerignore | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 services/frontend/.dockerignore

diff --git a/services/frontend/.dockerignore b/services/frontend/.dockerignore
new file mode 100644
index 0000000..4f509e5
--- /dev/null
+++ b/services/frontend/.dockerignore
@@ -0,0 +1 @@
+*.env
\ No newline at end of file

From 99d0ea8accfed6cda160311b150689182e418c2a Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Tue, 15 Apr 2025 19:06:55 -0400
Subject: [PATCH 08/64] update condition for workflows

---
 .github/workflows/backend-docker-image-build.yml  | 3 ++-
 .github/workflows/frontend-docker-image-build.yml | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/backend-docker-image-build.yml b/.github/workflows/backend-docker-image-build.yml
index 5ece471..4cb163c 100644
--- a/.github/workflows/backend-docker-image-build.yml
+++ b/.github/workflows/backend-docker-image-build.yml
@@ -4,7 +4,8 @@ on:
   push:
     branches:
       - "**"
-
+    paths:
+      - 'services/backend/**'
 jobs:
   backend_build:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/frontend-docker-image-build.yml b/.github/workflows/frontend-docker-image-build.yml
index b547d18..abd2475 100644
--- a/.github/workflows/frontend-docker-image-build.yml
+++ b/.github/workflows/frontend-docker-image-build.yml
@@ -4,6 +4,8 @@ on:
   push:
     branches:
       - "**"
+    paths:
+      - 'services/frontend/**'
 
 jobs:
   frontend_build:
@@ -41,4 +43,4 @@ jobs:
             --allow-unauthenticated \
             --memory 1Gi \
             --timeout 1800s \
-            --set-env-vars "API_URL=https://backend-service-273412-default.run.app/NuBot"
+            --set-env-vars "API_URL=https://backend-service-273412-default.run.app/NuBot/"

From f32fef7cd231b584f84aac113501c61000824044 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Tue, 15 Apr 2025 23:14:35 -0400
Subject: [PATCH 09/64] update mlflow

---
 services/backend/.dockerignore             |  6 +-
 services/backend/src/dataflow/rag_model.py | 37 +++++------
 services/frontend/.env                     |  2 +-
 services/frontend/app.py                   | 76 ++++++++++++++--------
 4 files changed, 71 insertions(+), 50 deletions(-)

diff --git a/services/backend/.dockerignore b/services/backend/.dockerignore
index 58329ad..c618940 100644
--- a/services/backend/.dockerignore
+++ b/services/backend/.dockerignore
@@ -1,3 +1,7 @@
 *.env
 __pycache__/
-scraped_data/
\ No newline at end of file
+scraped_data/
+mlruns/
+mlflow.db/
+mlartifacts/
+logs/
\ No newline at end of file
diff --git a/services/backend/src/dataflow/rag_model.py b/services/backend/src/dataflow/rag_model.py
index fe05420..5d25ac6 100644
--- a/services/backend/src/dataflow/rag_model.py
+++ b/services/backend/src/dataflow/rag_model.py
@@ -27,38 +27,27 @@
 # Where you currently have this line:
 mlflow.set_experiment("rag_experiment")
 def get_or_create_experiment(experiment_name):
-
-    # Check if experiment exists
     try:
         experiment = mlflow.get_experiment_by_name(experiment_name)
-        
         if experiment is not None:
-            # Check if experiment is active (not deleted)
             if experiment.lifecycle_stage == "active":
-                print(f"Found active experiment '{experiment_name}' with ID: {experiment.experiment_id}")
+                print(f"✅ Found experiment: {experiment.experiment_id}")
                 return experiment.experiment_id
             else:
-                # Experiment exists but is deleted, create a new one with timestamp
-                new_name = f"{experiment_name}_{int(time.time())}"
-                experiment_id = mlflow.create_experiment(new_name)
-                print(f"Original experiment was deleted. Created new experiment '{new_name}' with ID: {experiment_id}")
-                return experiment_id
-        else:
-            # Create new experiment
-            experiment_id = mlflow.create_experiment(experiment_name)
-            print(f"Created new experiment '{experiment_name}' with ID: {experiment_id}")
-            return experiment_id
-    except Exception as e:
-        print(f"Error getting or creating experiment: {e}")
-        # Fallback - create a new experiment with timestamp
-        new_name = f"{experiment_name}_{int(time.time())}"
-        experiment_id = mlflow.create_experiment(new_name)
-        print(f"Created fallback experiment '{new_name}' with ID: {experiment_id}")
+                print(f"⚠️ Experiment exists but is deleted. Recreating...")
+        # Create a new experiment (either not found or was deleted)
+        experiment_id = mlflow.create_experiment(experiment_name)
+        print(f"🆕 Created experiment '{experiment_name}' with ID: {experiment_id}")
         return experiment_id
+    except Exception as e:
+        print(f"🚨 Exception during experiment creation: {e}")
+        return None
 
 # Replace it with:
 experiment_id = get_or_create_experiment("rag_experiment")
-mlflow.set_experiment_tag("description", "RAG pipeline with Mistral AI model")
+if not experiment_id:
+    raise ValueError("❌ Could not get or create a valid experiment ID. Aborting.")
+mlflow.set_tag("description", "RAG pipeline with Mistral AI model")
 if not os.environ.get("MISTRAL_API_KEY"):
   os.environ["MISTRAL_API_KEY"] = getpass.getpass("Enter API key for Mistral AI: ")
 
@@ -186,6 +175,10 @@ async def checkModel_fairness():
 
     query=input("generate query")
     response=generateResponse(query)
+    print("MLflow URI:", mlflow.get_tracking_uri())
+    print("Using experiment ID:", experiment_id)
+    print("Experiments available:", mlflow.search_experiments())
+
     print(response)
     #uncomment and enter prompts for model fairness and there is a limitation on api key
     # asyncio.run(checkModel_fairness())
\ No newline at end of file
diff --git a/services/frontend/.env b/services/frontend/.env
index 6148832..7a2998f 100644
--- a/services/frontend/.env
+++ b/services/frontend/.env
@@ -1 +1 @@
-API_URL="http://localhost:5002"
\ No newline at end of file
+API_URL="https://backend-service-640053216184.us-east1.run.app/NuBot/"
\ No newline at end of file
diff --git a/services/frontend/app.py b/services/frontend/app.py
index 03a793d..52e392e 100644
--- a/services/frontend/app.py
+++ b/services/frontend/app.py
@@ -1,30 +1,54 @@
 import streamlit as st
 import requests
-from dotenv import load_dotenv
 import os
+from dotenv import load_dotenv
+
 load_dotenv(override=True)
-# Define the backend API URL
-API_URL = os.getenv('API_URL')  # Update if running on a different host
-
-st.title("NuBot Chat Interface")
-st.markdown("### Ask NuBot any question!")
-
-# User input
-query = st.text_input("Enter your query:")
-
-if st.button("Submit"):
-    if query:
-        try:
-            # Send request to the backend API
-            response = requests.post(API_URL, json={"query": query})
-            
-            # Display the response
-            if response.status_code == 200:
-                st.success("Response from NuBot:")
-                st.write(response.json())
-            else:
-                st.error(f"Error {response.status_code}: {response.text}")
-        except requests.exceptions.RequestException as e:
-            st.error(f"Request failed: {e}")
-    else:
-        st.warning("Please enter a query before submitting.")
+
+# Load API URL from environment variable
+API_URL = os.getenv("API_URL", "http://localhost:8000/api/chat")
+
+# Streamlit app config
+st.set_page_config(page_title="NU Bot", page_icon="🤖", layout="centered")
+st.title("🤖 NU Bot")
+st.markdown("### Smart Chatbot for Northeastern University")
+
+# Input field for user questions
+user_input = st.text_input("Ask NU Bot a question:", "")
+
+# Display response
+if user_input:
+    try:
+        response = requests.post(API_URL, json={"query": user_input},verify=False)
+        # print(response.json())
+        if response.status_code == 200:
+            answer = response.json().get("answer", "No answer provided.")
+        else:
+            answer = f"Error: {response.status_code} - {response.text}"
+    except Exception as e:
+        answer = f"An error occurred: {e}"
+
+    st.markdown(f"**NU Bot says:** {answer}")
+
+# Technologies
+with st.expander("🔧 Technologies Used"):
+    st.markdown("""
+    - Google Cloud Platform (GCP)
+    - Mistral AI
+    - Python
+    - GitHub
+    """)
+
+# Features
+with st.expander("💡 Features"):
+    st.markdown("""
+    - Interactive chatbot interface
+    - Smart Q&A using university web data
+    - Real-time responses
+    - Scalable architecture
+    """)
+
+# Contact
+with st.expander("📬 Contact Us"):
+    st.markdown("Email us at [nubot@northeastern.edu](mailto:nubot@northeastern.edu)")
+

From 8c9aa36b4488e05538374e3f38a5800ac728dec8 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Tue, 15 Apr 2025 23:33:38 -0400
Subject: [PATCH 10/64] revert frontend

---
 services/frontend/app.py | 76 ++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 50 deletions(-)

diff --git a/services/frontend/app.py b/services/frontend/app.py
index 52e392e..03a793d 100644
--- a/services/frontend/app.py
+++ b/services/frontend/app.py
@@ -1,54 +1,30 @@
 import streamlit as st
 import requests
-import os
 from dotenv import load_dotenv
-
+import os
 load_dotenv(override=True)
-
-# Load API URL from environment variable
-API_URL = os.getenv("API_URL", "http://localhost:8000/api/chat")
-
-# Streamlit app config
-st.set_page_config(page_title="NU Bot", page_icon="🤖", layout="centered")
-st.title("🤖 NU Bot")
-st.markdown("### Smart Chatbot for Northeastern University")
-
-# Input field for user questions
-user_input = st.text_input("Ask NU Bot a question:", "")
-
-# Display response
-if user_input:
-    try:
-        response = requests.post(API_URL, json={"query": user_input},verify=False)
-        # print(response.json())
-        if response.status_code == 200:
-            answer = response.json().get("answer", "No answer provided.")
-        else:
-            answer = f"Error: {response.status_code} - {response.text}"
-    except Exception as e:
-        answer = f"An error occurred: {e}"
-
-    st.markdown(f"**NU Bot says:** {answer}")
-
-# Technologies
-with st.expander("🔧 Technologies Used"):
-    st.markdown("""
-    - Google Cloud Platform (GCP)
-    - Mistral AI
-    - Python
-    - GitHub
-    """)
-
-# Features
-with st.expander("💡 Features"):
-    st.markdown("""
-    - Interactive chatbot interface
-    - Smart Q&A using university web data
-    - Real-time responses
-    - Scalable architecture
-    """)
-
-# Contact
-with st.expander("📬 Contact Us"):
-    st.markdown("Email us at [nubot@northeastern.edu](mailto:nubot@northeastern.edu)")
-
+# Define the backend API URL
+API_URL = os.getenv('API_URL')  # Update if running on a different host
+
+st.title("NuBot Chat Interface")
+st.markdown("### Ask NuBot any question!")
+
+# User input
+query = st.text_input("Enter your query:")
+
+if st.button("Submit"):
+    if query:
+        try:
+            # Send request to the backend API
+            response = requests.post(API_URL, json={"query": query})
+            
+            # Display the response
+            if response.status_code == 200:
+                st.success("Response from NuBot:")
+                st.write(response.json())
+            else:
+                st.error(f"Error {response.status_code}: {response.text}")
+        except requests.exceptions.RequestException as e:
+            st.error(f"Request failed: {e}")
+    else:
+        st.warning("Please enter a query before submitting.")

From 0baa07da6c9756c48b7363183054476a592ae22b Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Wed, 16 Apr 2025 12:41:44 -0400
Subject: [PATCH 11/64] check mlflow issue

---
 services/backend/src/dataflow/rag_model.py | 38 +++++++++++++++-------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/services/backend/src/dataflow/rag_model.py b/services/backend/src/dataflow/rag_model.py
index 5d25ac6..7b20b6b 100644
--- a/services/backend/src/dataflow/rag_model.py
+++ b/services/backend/src/dataflow/rag_model.py
@@ -25,7 +25,7 @@
 FAISS_INDEX_FOLDER= os.getenv('FAISS_INDEX_FOLDER')
 mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)  # Remote MLflow Server
 # Where you currently have this line:
-mlflow.set_experiment("rag_experiment")
+
 def get_or_create_experiment(experiment_name):
     try:
         experiment = mlflow.get_experiment_by_name(experiment_name)
@@ -45,8 +45,15 @@ def get_or_create_experiment(experiment_name):
 
 # Replace it with:
 experiment_id = get_or_create_experiment("rag_experiment")
-if not experiment_id:
-    raise ValueError("❌ Could not get or create a valid experiment ID. Aborting.")
+def ensure_experiment(name):
+    try:
+        mlflow.set_experiment(name)
+    except RestException:
+        mlflow.create_experiment(name)
+        mlflow.set_experiment(name)
+
+ensure_experiment("rag_experiment")
+# mlflow.set_experiment("rag_experiment")
 mlflow.set_tag("description", "RAG pipeline with Mistral AI model")
 if not os.environ.get("MISTRAL_API_KEY"):
   os.environ["MISTRAL_API_KEY"] = getpass.getpass("Enter API key for Mistral AI: ")
@@ -60,13 +67,20 @@ def get_llm():
 def get_prompt():
 # Define prompt for question-answering
     # Your prompt template
-    template = """Use the following pieces of context to answer the question at the end.
-If you don't know the answer, just say that you don't know, don't try to make up an answer.
-Use three sentences maximum and keep the answer as concise as possible.
-Always say "thanks for asking!" at the end of the answer.
+    template = """You are an expert assistant helping to answer questions based only on the given context.
+
+Instructions:
+- Use ONLY the context below to answer.
+- If the context does not contain the answer, say: "I don't know based on the available information."
+- Answer in 2-3 sentences, clearly and factually.
+- End your response with: "Thanks for asking!"
+
+Context:
 {context}
+
 Question: {question}
-Helpful Answer:"""
+
+Answer:"""
     custom_rag_prompt = PromptTemplate.from_template(template)
     return custom_rag_prompt
 
@@ -102,9 +116,9 @@ def load_embeddings():
 vector_store = FAISS.load_local(FAISS_INDEX_FOLDER, embeddings, allow_dangerous_deserialization=True)
 # Define application steps
 def retrieve(state: State):
-    with mlflow.start_run(nested=True, run_name="retrieval",experiment_id=experiment_id):
+    with mlflow.start_run(nested=True, run_name="retrieval"):
         start_time = time.time()
-        retrieved_docs = vector_store.similarity_search(state["question"])
+        retrieved_docs = vector_store.similarity_search(state["question"],k=10)
         retrieval_time = time.time() - start_time
     
         # Extract only metadata
@@ -123,7 +137,7 @@ def retrieve(state: State):
 # Initialize prompt once and store in a global variable
 prompt = get_prompt()
 def generate(state: State):
-    with mlflow.start_run(nested=True, run_name="generation",experiment_id=experiment_id):
+    with mlflow.start_run(nested=True, run_name="generation"):
         start_time = time.time()
         docs_content = "\n\n".join(doc.page_content for doc in state["context"])
         token_count = len(docs_content.split()) 
@@ -150,7 +164,7 @@ def generate(state: State):
 def generateResponse(query):
 # Compile application and test
     try:
-         with mlflow.start_run(run_name="RAG_Pipeline",experiment_id=experiment_id):
+         with mlflow.start_run(run_name="RAG_Pipeline"):
             mlflow.log_param("query", query)
             graph_builder = StateGraph(State).add_sequence([retrieve, generate])
             graph_builder.add_edge(START, "retrieve")

From 26f42966f820d9628be1e43320523cc7109503b8 Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Wed, 16 Apr 2025 12:47:53 -0400
Subject: [PATCH 12/64] update file safe name for scraper files

---
 services/backend/src/dataflow/scraper.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/services/backend/src/dataflow/scraper.py b/services/backend/src/dataflow/scraper.py
index f5a99c4..32c696f 100644
--- a/services/backend/src/dataflow/scraper.py
+++ b/services/backend/src/dataflow/scraper.py
@@ -38,7 +38,8 @@ def safe_filename(url):
     parsed = urlparse(url)
     path = parsed.path.strip('/') or 'index'
     filename = re.sub(r'[^A-Za-z0-9_\-]', '_', path) + ".json"
-    return os.path.join(DATA_FOLDER, filename)
+    url_hash = hashlib.md5(url.encode()).hexdigest()[:8]  # short digest of the full URL keeps same-path URLs apart
+    return os.path.join(DATA_FOLDER, f"{os.path.splitext(filename)[0]}_{url_hash}.json")  # drop the ".json" added above
 
 async def fetch(session, url, semaphore):
     """Fetch the content of the URL asynchronously."""

From e5766cb3f229a7655ea6f8850ba8e50185bc5343 Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Wed, 16 Apr 2025 12:48:32 -0400
Subject: [PATCH 13/64] add package

---
 services/backend/src/dataflow/scraper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/backend/src/dataflow/scraper.py b/services/backend/src/dataflow/scraper.py
index 32c696f..b73f4ab 100644
--- a/services/backend/src/dataflow/scraper.py
+++ b/services/backend/src/dataflow/scraper.py
@@ -6,7 +6,7 @@
 import re
 from urllib.parse import urljoin, urlparse
 from dotenv import load_dotenv
-
+import hashlib
 from store_data import upload_many_blobs_with_transfer_manager
 load_dotenv(override=True)
 # Configuration

From 6b774c5564470105892bf653e4a14b06934127df Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Wed, 16 Apr 2025 18:14:10 -0400
Subject: [PATCH 14/64] try mlflow fix of id tracking

---
 services/backend/src/dataflow/rag_model.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/services/backend/src/dataflow/rag_model.py b/services/backend/src/dataflow/rag_model.py
index 7b20b6b..b3d0833 100644
--- a/services/backend/src/dataflow/rag_model.py
+++ b/services/backend/src/dataflow/rag_model.py
@@ -44,17 +44,18 @@ def get_or_create_experiment(experiment_name):
         return None
 
 # Replace it with:
-experiment_id = get_or_create_experiment("rag_experiment")
+
 def ensure_experiment(name):
     try:
         mlflow.set_experiment(name)
-    except RestException:
+    except Exception as e:
         mlflow.create_experiment(name)
         mlflow.set_experiment(name)
 
 ensure_experiment("rag_experiment")
 # mlflow.set_experiment("rag_experiment")
 mlflow.set_tag("description", "RAG pipeline with Mistral AI model")
+# experiment_id = get_or_create_experiment("rag_experiment")
 if not os.environ.get("MISTRAL_API_KEY"):
   os.environ["MISTRAL_API_KEY"] = getpass.getpass("Enter API key for Mistral AI: ")
 

From 06734317aff120286f8264f0659057fa250df6a9 Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Wed, 16 Apr 2025 20:59:58 -0400
Subject: [PATCH 15/64] second revision of mlflow

---
 services/backend/src/dataflow/rag_model.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/services/backend/src/dataflow/rag_model.py b/services/backend/src/dataflow/rag_model.py
index b3d0833..23327a8 100644
--- a/services/backend/src/dataflow/rag_model.py
+++ b/services/backend/src/dataflow/rag_model.py
@@ -165,6 +165,7 @@ def generate(state: State):
 def generateResponse(query):
 # Compile application and test
     try:
+        ensure_experiment("rag_experiment")
          with mlflow.start_run(run_name="RAG_Pipeline"):
             mlflow.log_param("query", query)
             graph_builder = StateGraph(State).add_sequence([retrieve, generate])
@@ -189,6 +190,7 @@ async def checkModel_fairness():
 if __name__ == "__main__":
 
     query=input("generate query")
+    ensure_experiment("rag_experiment")
     response=generateResponse(query)
     print("MLflow URI:", mlflow.get_tracking_uri())
     print("Using experiment ID:", experiment_id)

From f0e2fabd078a1506a3e93e3ac9ebfd1cb0531a25 Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Thu, 17 Apr 2025 00:30:57 -0400
Subject: [PATCH 16/64] fix alignment

---
 services/backend/src/dataflow/rag_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/services/backend/src/dataflow/rag_model.py b/services/backend/src/dataflow/rag_model.py
index 23327a8..49c3581 100644
--- a/services/backend/src/dataflow/rag_model.py
+++ b/services/backend/src/dataflow/rag_model.py
@@ -166,7 +166,7 @@ def generateResponse(query):
 # Compile application and test
     try:
         ensure_experiment("rag_experiment")
-         with mlflow.start_run(run_name="RAG_Pipeline"):
+        with mlflow.start_run(run_name="RAG_Pipeline"):
             mlflow.log_param("query", query)
             graph_builder = StateGraph(State).add_sequence([retrieve, generate])
             graph_builder.add_edge(START, "retrieve")

From 2c9b53479f514378b5e74fba8f709d4a1c90cefc Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Fri, 18 Apr 2025 18:15:36 -0400
Subject: [PATCH 17/64] add multiple urls scraping code

---
 services/backend/.env                       |  1 +
 services/backend/runtime-requirements.txt   |  1 +
 services/backend/src/dataflow/rag_model.py  |  5 +++--
 services/backend/src/dataflow/scraper.py    | 25 ++++++++++++++-------
 services/backend/src/dataflow/store_data.py |  2 +-
 5 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/services/backend/.env b/services/backend/.env
index 7cc284f..60515d6 100644
--- a/services/backend/.env
+++ b/services/backend/.env
@@ -5,6 +5,7 @@ CONCURRENT_REQUESTS = 10
 DATA_FOLDER = "scraped_data"
 MISTRAL_API_KEY="1UTJndGP95gDBZopQkDojiZ5bCzSJG9p"
 MLFLOW_TRACKING_URI="http://localhost:5000"
+URLS_LIST= "https://www.khoury.northeastern.edu/"
 PORT=8080
 HOST=127.0.0.1
 BUCKET_NAME=scraped_raw_data_nubot
diff --git a/services/backend/runtime-requirements.txt b/services/backend/runtime-requirements.txt
index 4ff0497..953f6d0 100644
--- a/services/backend/runtime-requirements.txt
+++ b/services/backend/runtime-requirements.txt
@@ -13,6 +13,7 @@ langchain-community
 langgraph
 google-cloud-storage
 datasets
+gcsfs
 beautifulsoup4
 aiohttp
 flask-cors
diff --git a/services/backend/src/dataflow/rag_model.py b/services/backend/src/dataflow/rag_model.py
index 49c3581..e6a2fb9 100644
--- a/services/backend/src/dataflow/rag_model.py
+++ b/services/backend/src/dataflow/rag_model.py
@@ -71,8 +71,9 @@ def get_prompt():
     template = """You are an expert assistant helping to answer questions based only on the given context.
 
 Instructions:
-- Use ONLY the context below to answer.
-- If the context does not contain the answer, say: "I don't know based on the available information."
+- Use the context and search in  below to answer.
+- you can search https://www.khoury.northeastern.edu/ for answering better if not found any in context
+- If context and the website provided  does not contain the answer, say: "I don't know based on the available information."
 - Answer in 2-3 sentences, clearly and factually.
 - End your response with: "Thanks for asking!"
 
diff --git a/services/backend/src/dataflow/scraper.py b/services/backend/src/dataflow/scraper.py
index b73f4ab..0003587 100644
--- a/services/backend/src/dataflow/scraper.py
+++ b/services/backend/src/dataflow/scraper.py
@@ -10,7 +10,9 @@
 from store_data import upload_many_blobs_with_transfer_manager
 load_dotenv(override=True)
 # Configuration
-BASE_URL = os.getenv('BASE_URL')
+URLS_LIST=[u.strip() for u in os.getenv('URLS_LIST','').split(",") if u.strip()]  # trim entries and skip blanks so an unset variable yields []
+
+# BASE_URL is no longer a module constant: each URLS_LIST entry is passed to async_scrape as BASE_URL.
 MAX_DEPTH = int(os.getenv('MAX_DEPTH'))             # Maximum recursion depth (base URL is depth 0)
 CONCURRENT_REQUESTS = int(os.getenv('CONCURRENT_REQUESTS'))  # Maximum number of concurrent requests
 
@@ -54,7 +56,7 @@ async def fetch(session, url, semaphore):
         print(f"Error fetching {url}: {e}")
         return None
 
-async def async_scrape(url, depth=0, session=None, semaphore=None):
+async def async_scrape(url,BASE_URL, depth=0, session=None, semaphore=None):
     """Recursively scrape pages asynchronously and store in JSON format."""
     if depth > MAX_DEPTH:
         return
@@ -96,26 +98,33 @@ async def async_scrape(url, depth=0, session=None, semaphore=None):
         next_url = urljoin(url, link['href'])
         if urlparse(next_url).netloc == urlparse(BASE_URL).netloc:
             next_url = next_url.split('#')[0]  # Remove fragments
-            tasks.append(async_scrape(next_url, depth + 1, session, semaphore))
+            tasks.append(async_scrape(next_url,BASE_URL, depth + 1, session, semaphore))
 
     if tasks:
         await asyncio.gather(*tasks)
     
 
-async def scrape_and_load():
+async def scrape_and_load(CURRENT_URl):
     """Main function to initiate scraping."""
     semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
     
     async with aiohttp.ClientSession() as session:
-        await async_scrape(BASE_URL, depth=0, session=session, semaphore=semaphore)
+        await async_scrape(CURRENT_URl,BASE_URL=CURRENT_URl, depth=0, session=session, semaphore=semaphore)
     
 
 def scrape_and_load_task():
-    asyncio.run(scrape_and_load())
+    for url in URLS_LIST:
+        BASE_URL=url
+        asyncio.run(scrape_and_load(BASE_URL))
+        print("*"*15)
+        print(f"scraping {url} done")
+        print("*"*15)
+
     upload_many_blobs_with_transfer_manager()
     return
 
 
 if __name__ == '__main__':
-    asyncio.run(scrape_and_load())
-    upload_many_blobs_with_transfer_manager()
\ No newline at end of file
+    scrape_and_load_task()
+    # asyncio.run(scrape_and_load())
+    # upload_many_blobs_with_transfer_manager()
\ No newline at end of file
diff --git a/services/backend/src/dataflow/store_data.py b/services/backend/src/dataflow/store_data.py
index 837e630..e89ad85 100644
--- a/services/backend/src/dataflow/store_data.py
+++ b/services/backend/src/dataflow/store_data.py
@@ -68,7 +68,7 @@ def upload_many_blobs_with_transfer_manager(
 
     storage_client = Client()
     bucket = storage_client.bucket(BUCKET_NAME)
-    source_directory=os.path.join("..","..","scraped_data")
+    source_directory=os.path.join("scraped_data")
     filenames = [f for f in os.listdir(source_directory) if f.endswith(".json")]
     for filename in filenames:
         file_path = os.path.join(source_directory, filename)

From 0e95868b93d0ee74cf0647e1708d6f9305499ff4 Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Fri, 18 Apr 2025 18:16:36 -0400
Subject: [PATCH 18/64] Update backend-docker-image-build.yml

---
 .github/workflows/backend-docker-image-build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/backend-docker-image-build.yml b/.github/workflows/backend-docker-image-build.yml
index 4cb163c..54e8976 100644
--- a/.github/workflows/backend-docker-image-build.yml
+++ b/.github/workflows/backend-docker-image-build.yml
@@ -42,4 +42,4 @@ jobs:
             --allow-unauthenticated \
             --memory 4Gi \
             --timeout 3600s \
-            --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index"
+            --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/"

From d4640831c2ae7e68216838461588c04d9db9d9ec Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Fri, 18 Apr 2025 23:40:33 -0400
Subject: [PATCH 19/64] update prefect deployment flow

---
 .github/workflows/prefect_orchestraiton.yml   |  56 +++++++
 prefectWorkflows/.dockerignore                |   3 +
 prefectWorkflows/.env                         |   4 +-
 prefectWorkflows/Dockerfile                   |  21 +++
 prefectWorkflows/dataflow/chunk_data.py       |  18 ++-
 prefectWorkflows/dataflow/rag_model.py        | 149 ------------------
 prefectWorkflows/dataflow/scraper.py          |  47 ++++--
 prefectWorkflows/dataflow/store_data.py       |  15 +-
 prefectWorkflows/{scraper_flow.py => flow.py} |   4 +
 prefectWorkflows/requirements.txt             |  17 ++
 10 files changed, 167 insertions(+), 167 deletions(-)
 create mode 100644 .github/workflows/prefect_orchestraiton.yml
 create mode 100644 prefectWorkflows/.dockerignore
 create mode 100644 prefectWorkflows/Dockerfile
 delete mode 100644 prefectWorkflows/dataflow/rag_model.py
 rename prefectWorkflows/{scraper_flow.py => flow.py} (85%)
 create mode 100644 prefectWorkflows/requirements.txt

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
new file mode 100644
index 0000000..6db5d9e
--- /dev/null
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -0,0 +1,56 @@
+name: Deploy Prefect Flow to Cloud Run
+
+on:
+  push:
+    branches:
+      - "**" # matches every branch; narrow to e.g. "main" to deploy only from main
+  workflow_dispatch: # allow manual trigger from the Actions tab if needed
+
+jobs:
+  build-deploy:
+    runs-on: ubuntu-latest
+
+    steps:
+      # Step 1: Check out repository code
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      # Step 2: Set up gcloud CLI
+      - name: Set up Google Cloud SDK
+        uses: google-github-actions/setup-gcloud@v1
+        with:
+          credentials_json: "${{ secrets.GCP_KEY }}"
+
+      # Step 3: Configure Docker auth for Artifact Registry
+      - name: Docker login for Artifact Registry
+        run: |
+          gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev
+
+      # Step 4: Build the Docker image from prefectWorkflows/ (where its Dockerfile and .dockerignore live)
+      - name: Build Docker image
+        run: |
+          docker build -t ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest prefectWorkflows
+
+      # Step 5: Push the image to Artifact Registry
+      - name: Push Docker image
+        run: |
+          docker push ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest
+
+      # Step 6: Install Prefect in the runner (for CLI use)
+      - name: Install Prefect
+        run: pip install --no-cache prefect==2.*
+
+      # Step 7: Register/Update Prefect deployment
+      - name: Register Prefect deployment
+        # Use Prefect CLI to build and apply the deployment
+        run: |
+          prefect deployment build flow.py:scraperflow -n "ScraperFlow Weekly" \
+              --pool my-cloud-run-pool -q default \
+              --cron "0 9 * * 6" --skip-upload --apply
+        env:
+          PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }}
+          PREFECT_API_URL: ${{ secrets.PREFECT_API_URL }}
+
+      # Step 8: (Optional) Confirm success
+      - name: Deployment successful
+        run: echo "Prefect flow deployment updated and scheduled successfully."
diff --git a/prefectWorkflows/.dockerignore b/prefectWorkflows/.dockerignore
new file mode 100644
index 0000000..3168a00
--- /dev/null
+++ b/prefectWorkflows/.dockerignore
@@ -0,0 +1,3 @@
+*.env
+scraped_data/
+faiss_index/
\ No newline at end of file
diff --git a/prefectWorkflows/.env b/prefectWorkflows/.env
index 29ab14c..0b2b74e 100644
--- a/prefectWorkflows/.env
+++ b/prefectWorkflows/.env
@@ -6,4 +6,6 @@ DATA_FOLDER = "scraped_data"
 BUCKET_NAME=scraped_raw_data_nubot
 RAW_DATA_FOLDER=raw_data
 FAISS_INDEX_FOLDER=faiss_index
-GOOGLE_APPLICATION_CREDENTIALS="E:/gcpkeys/nubot/nubot-nikhil-6adeee091d55.json"
\ No newline at end of file
+GOOGLE_APPLICATION_CREDENTIALS="E:/gcpkeys/nubot/nubot-nikhil-6adeee091d55.json"
+PREFECT_API_KEY=pnu_VuQWNSlXmc2Hqknf
+PREFECT_API_URL="https://api.prefect.cloud/api/accounts-8a55-446a-ac46-80a3f843d8b6"
\ No newline at end of file
diff --git a/prefectWorkflows/Dockerfile b/prefectWorkflows/Dockerfile
new file mode 100644
index 0000000..5d3a8c7
--- /dev/null
+++ b/prefectWorkflows/Dockerfile
@@ -0,0 +1,21 @@
+# Use a lightweight Python base image
+FROM python:3.10-slim
+
+# Set working directory in container
+WORKDIR /opt/prefect/project
+
+# Copy requirements and install them (if you have a requirements.txt or pyproject.toml)
+COPY requirements.txt ./ 
+RUN pip install -U pip && pip install -r requirements.txt
+
+# Install Prefect (if not already included in requirements)
+RUN pip install prefect==2.*
+
+# Copy the Prefect flow code and related modules into the image
+COPY . .
+
+# (Optional) If your flow code is a package with setup.py, you could RUN pip install . instead.
+
+# Set the default command (entrypoint) for the container to do nothing by default.
+# Prefect will override this when running the flow, so we keep it simple.
+CMD ["python", "-c", "print('Prefect flow container ready')"]
diff --git a/prefectWorkflows/dataflow/chunk_data.py b/prefectWorkflows/dataflow/chunk_data.py
index aa83b8a..265ce1a 100644
--- a/prefectWorkflows/dataflow/chunk_data.py
+++ b/prefectWorkflows/dataflow/chunk_data.py
@@ -8,11 +8,23 @@
 from dotenv import load_dotenv
 from google.cloud.storage import Client
 
-from dataflow.store_data import upload_faiss_index_to_bucket
+from store_data import upload_faiss_index_to_bucket
 load_dotenv(override=True)
 BUCKET_NAME= os.getenv('BUCKET_NAME')
 from google.auth import default
-credentials, project = default()
+from google.oauth2 import service_account
+
+# Try to get credentials - works in both Docker and Cloud Run
+try:
+    # First try Application Default Credentials (works in Cloud Run)
+    credentials, project = default()
+except Exception:
+    # Fall back to explicit credentials file (for Docker)
+    credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+    if credentials_path:
+        credentials = service_account.Credentials.from_service_account_file(credentials_path)
+    else:
+        raise Exception("No credentials available")
 RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER')
 def chunk_data():
     # Load all JSON files from a directory
@@ -53,4 +65,4 @@ def chunk_data():
 
 if __name__=="__main__":
     chunk_data()
-    
+    upload_faiss_index_to_bucket()
\ No newline at end of file
diff --git a/prefectWorkflows/dataflow/rag_model.py b/prefectWorkflows/dataflow/rag_model.py
deleted file mode 100644
index ffcfa6c..0000000
--- a/prefectWorkflows/dataflow/rag_model.py
+++ /dev/null
@@ -1,149 +0,0 @@
-from functools import lru_cache
-from langchain import hub
-from langchain_core.documents import Document
-from langgraph.graph import START, StateGraph
-from typing_extensions import List, TypedDict
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain.chat_models import init_chat_model
-from langchain_community.vectorstores import FAISS
-import getpass
-import os
-from dotenv import load_dotenv
-import mlflow
-import time
-from langfair.auto import AutoEval
-import asyncio
-# Load the FAISS index
-from google.cloud.storage import Client
-import tempfile
-import os
-load_dotenv(override=True)
-mlflow.langchain.autolog()
-MLFLOW_TRACKING_URI =os.environ.get("MLFLOW_TRACKING_URI")
-MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
-FAISS_INDEX_FOLDER= os.getenv('FAISS_INDEX_FOLDER')
-mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)  # Remote MLflow Server
-mlflow.set_experiment("rag_experiment")
-if not os.environ.get("MISTRAL_API_KEY"):
-  os.environ["MISTRAL_API_KEY"] = getpass.getpass("Enter API key for Mistral AI: ")
-
-@lru_cache(maxsize=None)
-def get_llm():
-    llm = init_chat_model("mistral-large-latest", model_provider="mistralai")
-    return llm
-
-@lru_cache(maxsize=None)
-def get_prompt():
-# Define prompt for question-answering
-    prompt = hub.pull("rlm/rag-prompt")
-    return prompt
-
-
-# Define state for application
-class State(TypedDict):
-    question: str
-    context: List[Document]
-    answer: str
-
-
-@lru_cache(maxsize=None)
-def load_embeddings():
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    return embeddings
-
-
-
-# Initialize GCS client
-storage_client = Client()
-bucket=storage_client.bucket(os.getenv('BUCKET_NAME'))
-embeddings=load_embeddings()
-if not os.path.exists(FAISS_INDEX_FOLDER):
-    os.makedirs(FAISS_INDEX_FOLDER, exist_ok=True)
-# Create a temporary directory
-# Download FAISS index files from bucket to FAISS_INDEX_FOLDER directory
-for blob in bucket.list_blobs(prefix=FAISS_INDEX_FOLDER):
-    # Extract just the filename from the full path
-    filename = os.path.basename(blob.name)
-    local_path = os.path.join(FAISS_INDEX_FOLDER, filename)
-    blob.download_to_filename(local_path)
-
-# Load FAISS index from directory
-vector_store = FAISS.load_local(FAISS_INDEX_FOLDER, embeddings, allow_dangerous_deserialization=True)
-# Define application steps
-def retrieve(state: State):
-    with mlflow.start_run(nested=True, run_name="retrieval"):
-        start_time = time.time()
-        retrieved_docs = vector_store.similarity_search(state["question"])
-        retrieval_time = time.time() - start_time
-    
-        # Extract only metadata
-        doc_metadata = [{"doc_id": doc.metadata.get("id", i), "source": doc.metadata.get("source", "unknown")}
-                        for i, doc in enumerate(retrieved_docs)]
-        
-        # Log metadata instead of full documents
-        mlflow.log_metric("retrieval_time", retrieval_time)
-        mlflow.log_param("retrieved_docs_count", len(retrieved_docs))
-        mlflow.log_dict(doc_metadata, "retrieved_docs.json")
-
-    return {"context": retrieved_docs}
-
-# Initialize LLM once and store in a global variable
-llm = get_llm()
-# Initialize prompt once and store in a global variable
-prompt = get_prompt()
-def generate(state: State):
-    with mlflow.start_run(nested=True, run_name="generation"):
-        start_time = time.time()
-        docs_content = "\n\n".join(doc.page_content for doc in state["context"])
-        token_count = len(docs_content.split()) 
-        # Use the global prompt instance
-        mlflow.log_param("retrieved_tokens", token_count)
-        mlflow.log_param("context_length", len(docs_content))
-        messages = prompt.invoke({"question": state["question"], "context": docs_content})
-        response = llm.invoke(messages)
-        generation_time = time.time() - start_time
-        
-        # Log LLM generation performance
-        mlflow.log_metric("generation_time", generation_time)
-        mlflow.log_param("response_length", len(response.content.split()))
-        mlflow.log_param("model_name", "mistral-large-latest")
-
-        # Save response
-        # with open("response.txt", "w") as f:
-        #     f.write(response.content)
-        # mlflow.log_artifact("response.txt")
-
-    return {"answer": response.content}
-
-
-def generateResponse(query):
-# Compile application and test
-    try:
-         with mlflow.start_run(run_name="RAG_Pipeline"):
-            mlflow.log_param("query", query)
-            graph_builder = StateGraph(State).add_sequence([retrieve, generate])
-            graph_builder.add_edge(START, "retrieve")
-            graph = graph_builder.compile()
-            response = graph.invoke({"question": f"{query}"})
-            mlflow.log_param("final_answer", response["answer"])
-            return response["answer"]
-    except Exception as e:
-        mlflow.log_param("error", str(e))
-        raise Exception(e)
-    
-async def checkModel_fairness():
-    auto_object = AutoEval(
-        prompts=["tell me about khoury"], 
-        langchain_llm=llm,
-        # toxicity_device=device # uncomment if GPU is available
-    )
-    results = await auto_object.evaluate()
-    print(results['metrics'])
-    
-if __name__ == "__main__":
-
-    query=input("generate query")
-    response=generateResponse(query)
-    print(response)
-    #uncomment and enter prompts for model fairness and there is a limitation on api key
-    # asyncio.run(checkModel_fairness())
\ No newline at end of file
diff --git a/prefectWorkflows/dataflow/scraper.py b/prefectWorkflows/dataflow/scraper.py
index d426ece..0003587 100644
--- a/prefectWorkflows/dataflow/scraper.py
+++ b/prefectWorkflows/dataflow/scraper.py
@@ -6,15 +6,30 @@
 import re
 from urllib.parse import urljoin, urlparse
 from dotenv import load_dotenv
-
-from dataflow.store_data import upload_many_blobs_with_transfer_manager
+import hashlib
+from store_data import upload_many_blobs_with_transfer_manager
 load_dotenv(override=True)
 # Configuration
-BASE_URL = os.getenv('BASE_URL')
+URLS_LIST=[u.strip() for u in os.getenv('URLS_LIST','').split(",") if u.strip()]  # trim entries and skip blanks so an unset variable yields []
+
+# BASE_URL is no longer a module constant: each URLS_LIST entry is passed to async_scrape as BASE_URL.
 MAX_DEPTH = int(os.getenv('MAX_DEPTH'))             # Maximum recursion depth (base URL is depth 0)
 CONCURRENT_REQUESTS = int(os.getenv('CONCURRENT_REQUESTS'))  # Maximum number of concurrent requests
+
 from google.auth import default
-credentials, project = default()
+from google.oauth2 import service_account
+
+# Try to get credentials - works in both Docker and Cloud Run
+try:
+    # First try Application Default Credentials (works in Cloud Run)
+    credentials, project = default()
+except Exception:
+    # Fall back to explicit credentials file (for Docker)
+    credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+    if credentials_path:
+        credentials = service_account.Credentials.from_service_account_file(credentials_path)
+    else:
+        raise Exception("No credentials available")
 # Create folder for JSON data
 DATA_FOLDER = "scraped_data"
 if not os.path.exists(DATA_FOLDER):
@@ -25,7 +40,8 @@ def safe_filename(url):
     parsed = urlparse(url)
     path = parsed.path.strip('/') or 'index'
     filename = re.sub(r'[^A-Za-z0-9_\-]', '_', path) + ".json"
-    return os.path.join(DATA_FOLDER, filename)
+    url_hash = hashlib.md5(url.encode()).hexdigest()[:8]  # short digest of the full URL keeps same-path URLs apart
+    return os.path.join(DATA_FOLDER, f"{os.path.splitext(filename)[0]}_{url_hash}.json")  # drop the ".json" added above
 
 async def fetch(session, url, semaphore):
     """Fetch the content of the URL asynchronously."""
@@ -40,7 +56,7 @@ async def fetch(session, url, semaphore):
         print(f"Error fetching {url}: {e}")
         return None
 
-async def async_scrape(url, depth=0, session=None, semaphore=None):
+async def async_scrape(url,BASE_URL, depth=0, session=None, semaphore=None):
     """Recursively scrape pages asynchronously and store in JSON format."""
     if depth > MAX_DEPTH:
         return
@@ -82,26 +98,33 @@ async def async_scrape(url, depth=0, session=None, semaphore=None):
         next_url = urljoin(url, link['href'])
         if urlparse(next_url).netloc == urlparse(BASE_URL).netloc:
             next_url = next_url.split('#')[0]  # Remove fragments
-            tasks.append(async_scrape(next_url, depth + 1, session, semaphore))
+            tasks.append(async_scrape(next_url,BASE_URL, depth + 1, session, semaphore))
 
     if tasks:
         await asyncio.gather(*tasks)
     
 
-async def scrape_and_load():
+async def scrape_and_load(CURRENT_URl):
     """Main function to initiate scraping."""
     semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
     
     async with aiohttp.ClientSession() as session:
-        await async_scrape(BASE_URL, depth=0, session=session, semaphore=semaphore)
+        await async_scrape(CURRENT_URl,BASE_URL=CURRENT_URl, depth=0, session=session, semaphore=semaphore)
     
 
 def scrape_and_load_task():
-    asyncio.run(scrape_and_load())
+    banner = "*" * 15  # separator printed around each per-URL completion message
+    for url in URLS_LIST:
+        asyncio.run(scrape_and_load(url))  # one event loop run per start URL
+        print(banner)
+        print(f"scraping {url} done")
+        print(banner)
+
     upload_many_blobs_with_transfer_manager()
     return
 
 
 if __name__ == '__main__':
-    asyncio.run(scrape_and_load())
-    upload_many_blobs_with_transfer_manager()
\ No newline at end of file
+    scrape_and_load_task()
+    # asyncio.run(scrape_and_load())
+    # upload_many_blobs_with_transfer_manager()
\ No newline at end of file
diff --git a/prefectWorkflows/dataflow/store_data.py b/prefectWorkflows/dataflow/store_data.py
index de6cc16..e89ad85 100644
--- a/prefectWorkflows/dataflow/store_data.py
+++ b/prefectWorkflows/dataflow/store_data.py
@@ -6,8 +6,19 @@
 RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER')
 FAISS_INDEX_FOLDER= os.getenv('FAISS_INDEX_FOLDER')
 from google.auth import default
-credentials, project = default()
+from google.oauth2 import service_account
 
+# Try to get credentials - works in both Docker and Cloud Run
+try:
+    # First try Application Default Credentials (works in Cloud Run)
+    credentials, project = default()
+except Exception:
+    # Fall back to the explicit service-account key file (for Docker)
+    credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
+    if not credentials_path:
+        raise Exception("No credentials available")
+    credentials = service_account.Credentials.from_service_account_file(credentials_path)
+    project = credentials.project_id  # keep `project` bound on the fallback path as well
 def get_blob_from_bucket():
     storage_client = Client()
     bucket = storage_client.bucket(BUCKET_NAME)
@@ -20,7 +31,7 @@ def get_blob_from_bucket():
 
 
 def upload_many_blobs_with_transfer_manager(
-    
+    workers=8
 ):
     """Upload every file in a list to a bucket, concurrently in a process pool.
 
diff --git a/prefectWorkflows/scraper_flow.py b/prefectWorkflows/flow.py
similarity index 85%
rename from prefectWorkflows/scraper_flow.py
rename to prefectWorkflows/flow.py
index b82f9a8..2a7ec09 100644
--- a/prefectWorkflows/scraper_flow.py
+++ b/prefectWorkflows/flow.py
@@ -1,7 +1,11 @@
 from prefect import flow, task
 from dataflow.scraper import  scrape_and_load_task 
 from dataflow.chunk_data import chunk_data
+from dotenv import load_dotenv
+import os
+load_dotenv(override=True)
 
+PREFECT_API_KEY=os.getenv('PREFECT_API_KEY')
 
 @task
 def scrape_all_urls_task():
diff --git a/prefectWorkflows/requirements.txt b/prefectWorkflows/requirements.txt
new file mode 100644
index 0000000..c4a2008
--- /dev/null
+++ b/prefectWorkflows/requirements.txt
@@ -0,0 +1,17 @@
+python-dotenv
+requests
+transformers==4.48.0
+sentence-transformers
+torch
+faiss-cpu
+mlflow
+langchain[mistralai]
+langchain-community
+langgraph
+google-cloud-storage
+datasets
+gcsfs
+beautifulsoup4
+aiohttp
+langchain-huggingface
+langfair

From ae83ad5565418993c0a4d4a04e243d5f5f68d771 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Fri, 18 Apr 2025 23:43:12 -0400
Subject: [PATCH 20/64] update path

---
 .github/workflows/prefect_orchestraiton.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 6db5d9e..14ade22 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -4,6 +4,8 @@ on:
   push:
     branches:
       - "**" # run on commits to main (adjust branch name as needed)
+    paths:
+      - "prefectWorkflows/**"
   workflow_dispatch: # allow manual trigger from the Actions tab if needed
 
 jobs:

From 459e854426a934ca1139fac4bcc5a8622563b8f8 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Fri, 18 Apr 2025 23:44:01 -0400
Subject: [PATCH 21/64] update prefect workflow

---
 .github/workflows/prefect_orchestraiton.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 14ade22..d6d22ed 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -31,6 +31,7 @@ jobs:
       # Step 4: Build the Docker image
       - name: Build Docker image
         run: |
+          cd prefectWorkflows
           docker build -t ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest .
 
       # Step 5: Push the image to Artifact Registry

From 29e359c88dce3be0f7d0f861a1d707d864886ccf Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Fri, 18 Apr 2025 23:55:32 -0400
Subject: [PATCH 22/64] add tgcloud run deploy command

---
 .github/workflows/prefect_orchestraiton.yml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index d6d22ed..8a02dcd 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -57,3 +57,24 @@ jobs:
       # Step 8: (Optional) Confirm success
       - name: Deployment successful
         run: echo "Prefect flow deployment updated and scheduled successfully."
+  deploy-worker:
+    needs: build-and-deploy-image
+    runs-on: ubuntu-latest
+    steps:
+      - name: Set up Google Cloud SDK
+        uses: google-github-actions/setup-gcloud@v1
+        with:
+          credentials_json: "${{ secrets.GCP_KEY }}"
+
+      - name: Deploy Prefect worker to Cloud Run
+        run: |
+          gcloud run deploy prefect-worker \
+            --image=prefecthq/prefect:3-latest \
+            --service-account prefect-sa@${{ secrets.GCP_PROJECT_ID }}.iam.gserviceaccount.com \
+            --no-cpu-throttling \
+            --platform managed \
+            --allow-unauthenticated \
+            --memory 4Gi \
+            --timeout 3600s \
+            --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST= "https://www.khoury.northeastern.edu/",PREFECT_API_URL=${{ secrets.PREFECT_API_URL }},PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} "
+            --args "prefect","worker","start","--pool","my-cloud-run-pool","-t","cloud-run"

From 560f113455ca09b6af5030c14365abd3085fd809 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Fri, 18 Apr 2025 23:56:33 -0400
Subject: [PATCH 23/64] temporarily disable path tracking

---
 .github/workflows/prefect_orchestraiton.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 8a02dcd..f0e0933 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -4,8 +4,8 @@ on:
   push:
     branches:
       - "**" # run on commits to main (adjust branch name as needed)
-    paths:
-      - "prefectWorkflows/**"
+    # paths:
+    #   - "prefectWorkflows/**"
   workflow_dispatch: # allow manual trigger from the Actions tab if needed
 
 jobs:

From 3274482574b8bf0fd5980625b8c6a2951426d673 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Fri, 18 Apr 2025 23:57:25 -0400
Subject: [PATCH 24/64] update

---
 .github/workflows/prefect_orchestraiton.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index f0e0933..c99162f 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -58,7 +58,7 @@ jobs:
       - name: Deployment successful
         run: echo "Prefect flow deployment updated and scheduled successfully."
   deploy-worker:
-    needs: build-and-deploy-image
+    needs: build-and-deploy
     runs-on: ubuntu-latest
     steps:
       - name: Set up Google Cloud SDK

From cba0fa72b650ff253e46e0233930928f92e10d53 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Fri, 18 Apr 2025 23:59:05 -0400
Subject: [PATCH 25/64] update flow

---
 .github/workflows/prefect_orchestraiton.yml | 27 ++++++++++++---------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index c99162f..2149da8 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -3,10 +3,8 @@ name: Deploy Prefect Flow to Cloud Run
 on:
   push:
     branches:
-      - "**" # run on commits to main (adjust branch name as needed)
-    # paths:
-    #   - "prefectWorkflows/**"
-  workflow_dispatch: # allow manual trigger from the Actions tab if needed
+      - "**" # run on commits to any branch (adjust as needed)
+  workflow_dispatch: # allow manual trigger from the Actions tab
 
 jobs:
   build-deploy:
@@ -32,7 +30,9 @@ jobs:
       - name: Build Docker image
         run: |
           cd prefectWorkflows
-          docker build -t ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest .
+          docker build \
+            -t ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \
+            .
 
       # Step 5: Push the image to Artifact Registry
       - name: Push Docker image
@@ -45,11 +45,14 @@ jobs:
 
       # Step 7: Register/Update Prefect deployment
       - name: Register Prefect deployment
-        # Use Prefect CLI to build and apply the deployment
         run: |
-          prefect deployment build flow.py:scraperflow -n "ScraperFlow Weekly" \
-              --pool my-cloud-run-pool -q default \
-              --cron "0 9 * * 6" --skip-upload --apply
+          prefect deployment build flow.py:scraperflow \
+            -n "ScraperFlow Weekly" \
+            --pool my-cloud-run-pool \
+            -q default \
+            --cron "0 9 * * 6" \
+            --skip-upload \
+            --apply
         env:
           PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }}
           PREFECT_API_URL: ${{ secrets.PREFECT_API_URL }}
@@ -57,9 +60,11 @@ jobs:
       # Step 8: (Optional) Confirm success
       - name: Deployment successful
         run: echo "Prefect flow deployment updated and scheduled successfully."
+
   deploy-worker:
-    needs: build-and-deploy
+    needs: build-deploy
     runs-on: ubuntu-latest
+
     steps:
       - name: Set up Google Cloud SDK
         uses: google-github-actions/setup-gcloud@v1
@@ -76,5 +81,5 @@ jobs:
             --allow-unauthenticated \
             --memory 4Gi \
             --timeout 3600s \
-            --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST= "https://www.khoury.northeastern.edu/",PREFECT_API_URL=${{ secrets.PREFECT_API_URL }},PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} "
+            --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL=${{ secrets.PREFECT_API_URL }},PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} \
             --args "prefect","worker","start","--pool","my-cloud-run-pool","-t","cloud-run"

From 330d92ebe7f4c25a3c762e25f2311fa177c47400 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sat, 19 Apr 2025 00:04:11 -0400
Subject: [PATCH 26/64] update credentials flow

---
 .github/workflows/prefect_orchestraiton.yml | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 2149da8..92542e3 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -15,11 +15,13 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v3
 
-      # Step 2: Set up gcloud CLI
-      - name: Set up Google Cloud SDK
-        uses: google-github-actions/setup-gcloud@v1
+      - name: GCP Authentication
+        uses: google-github-actions/auth@v2
         with:
           credentials_json: "${{ secrets.GCP_KEY }}"
+      # Step 2: Set up gcloud CLI
+      - name: Set up Google Cloud SDK
+        uses: google-github-actions/setup-gcloud@v2
 
       # Step 3: Configure Docker auth for Artifact Registry
       - name: Docker login for Artifact Registry
@@ -66,10 +68,12 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - name: Set up Google Cloud SDK
-        uses: google-github-actions/setup-gcloud@v1
+      - name: GCP Authentication
+        uses: google-github-actions/auth@v2
         with:
           credentials_json: "${{ secrets.GCP_KEY }}"
+      - name: Set up Google Cloud SDK
+        uses: google-github-actions/setup-gcloud@v2
 
       - name: Deploy Prefect worker to Cloud Run
         run: |

From 879eec3d27739133ae892c20160c6f0c1afd584c Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sat, 19 Apr 2025 00:08:35 -0400
Subject: [PATCH 27/64] update image

---
 .github/workflows/prefect_orchestraiton.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 92542e3..01d62eb 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -33,13 +33,13 @@ jobs:
         run: |
           cd prefectWorkflows
           docker build \
-            -t ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \
+            -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest \
             .
 
       # Step 5: Push the image to Artifact Registry
       - name: Push Docker image
         run: |
-          docker push ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest
+          docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest
 
       # Step 6: Install Prefect in the runner (for CLI use)
       - name: Install Prefect

From 8a8beb8d3232ac4a5fe33fd5ceb35de2e86cfeaf Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Sat, 19 Apr 2025 09:27:23 -0400
Subject: [PATCH 28/64] Update prefect_orchestraiton.yml

---
 .github/workflows/prefect_orchestraiton.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 01d62eb..2e49349 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -48,6 +48,7 @@ jobs:
       # Step 7: Register/Update Prefect deployment
       - name: Register Prefect deployment
         run: |
+          cd prefectWorkflows
           prefect deployment build flow.py:scraperflow \
             -n "ScraperFlow Weekly" \
             --pool my-cloud-run-pool \

From 0cfaf6c65c18f0016f8146a89b4c271f359cc9b2 Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Sat, 19 Apr 2025 12:06:20 -0400
Subject: [PATCH 29/64] update prefect .yaml

---
 prefectWorkflows/prefect.yaml | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/prefectWorkflows/prefect.yaml b/prefectWorkflows/prefect.yaml
index ab669e5..c929c2c 100644
--- a/prefectWorkflows/prefect.yaml
+++ b/prefectWorkflows/prefect.yaml
@@ -13,11 +13,11 @@ build:
 push:
 
 # # pull section allows you to provide instructions for cloning this project in remote locations
-# pull:
-# - prefect.deployments.steps.git_clone:
-#     repository: https://github.com/Nikhil-Kudupudi/NUBot.git
-#     branch: gcs_bucket
-#     access_token:
+pull:
+- prefect.deployments.steps.git_clone:
+    repository: https://github.com/Nikhil-Kudupudi/NUBot.git
+    branch: docker-deployment
+    access_token:
 
 # the deployments section allows you to provide configuration for deploying flows
 deployments:
@@ -34,14 +34,14 @@ deployments:
     work_queue_name:
     job_variables: {}
 - name: default
-  version:
+  version: 1.0.0
   tags: []
   concurrency_limit:
-  description:
-  entrypoint: scraper_flow.py:scraperflow
+  description: "cloud run prefect flow"
+  entrypoint: prefectWorkflows/flow.py:scraperflow
   parameters: {}
   work_pool:
-    name: nubot_dataflow
-    work_queue_name:
+    name: my-cloud-run-pool
+    work_queue_name: default
     job_variables: {}
   schedules: []

From 88e18b5f93b86e1ebf070e7d0ee1ad10f3fc26ab Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Sat, 19 Apr 2025 12:07:34 -0400
Subject: [PATCH 30/64] update the latest command

---
 .github/workflows/prefect_orchestraiton.yml | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 2e49349..f2d36f8 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -49,13 +49,7 @@ jobs:
       - name: Register Prefect deployment
         run: |
           cd prefectWorkflows
-          prefect deployment build flow.py:scraperflow \
-            -n "ScraperFlow Weekly" \
-            --pool my-cloud-run-pool \
-            -q default \
-            --cron "0 9 * * 6" \
-            --skip-upload \
-            --apply
+          prefect deployment prefect.yaml --apply
         env:
           PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }}
           PREFECT_API_URL: ${{ secrets.PREFECT_API_URL }}

From 340e488544f93a0c58e8d590cc6feec8e4794f8a Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Sat, 19 Apr 2025 12:36:53 -0400
Subject: [PATCH 31/64] update command

---
 .github/workflows/prefect_orchestraiton.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index f2d36f8..1fc6af0 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -49,7 +49,7 @@ jobs:
       - name: Register Prefect deployment
         run: |
           cd prefectWorkflows
-          prefect deployment prefect.yaml --apply
+          prefect deploy --apply prefect.yaml 
         env:
           PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }}
           PREFECT_API_URL: ${{ secrets.PREFECT_API_URL }}

From f9ceeac67383296261b330e46717afd633719474 Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Sat, 19 Apr 2025 12:56:10 -0400
Subject: [PATCH 32/64] update command for prefect deploy

---
 .github/workflows/prefect_orchestraiton.yml | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 1fc6af0..f52bfbc 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -49,7 +49,14 @@ jobs:
       - name: Register Prefect deployment
         run: |
           cd prefectWorkflows
-          prefect deploy --apply prefect.yaml 
+          prefect deploy flow.py:scraperflow \
+          --name "scraperflow-deployment" \
+          --description "ScraperFlow Deployment for scraping tasks" \
+          --tag scraper --tag production \
+          --cron "0 9 * * 6" \
+          --pool my-cloud-run-pool \
+          --work-queue default
+
         env:
           PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }}
           PREFECT_API_URL: ${{ secrets.PREFECT_API_URL }}

From 9c048d55dfc4873bc85ece4fbc1831e14a91a90b Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Sat, 19 Apr 2025 13:06:49 -0400
Subject: [PATCH 33/64] create a new flow

---
 .github/workflows/prefect_orchestraiton.yml | 20 ++++-----
 prefectWorkflows/prefect.yaml               | 47 ---------------------
 2 files changed, 10 insertions(+), 57 deletions(-)
 delete mode 100644 prefectWorkflows/prefect.yaml

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index f52bfbc..53ee8af 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -29,17 +29,17 @@ jobs:
           gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev
 
       # Step 4: Build the Docker image
-      - name: Build Docker image
-        run: |
-          cd prefectWorkflows
-          docker build \
-            -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest \
-            .
+      # - name: Build Docker image
+      #   run: |
+      #     cd prefectWorkflows
+      #     docker build \
+      #       -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest \
+      #       .
 
-      # Step 5: Push the image to Artifact Registry
-      - name: Push Docker image
-        run: |
-          docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest
+      # # Step 5: Push the image to Artifact Registry
+      # - name: Push Docker image
+      #   run: |
+      #     docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest
 
       # Step 6: Install Prefect in the runner (for CLI use)
       - name: Install Prefect
diff --git a/prefectWorkflows/prefect.yaml b/prefectWorkflows/prefect.yaml
deleted file mode 100644
index c929c2c..0000000
--- a/prefectWorkflows/prefect.yaml
+++ /dev/null
@@ -1,47 +0,0 @@
-# Welcome to your prefect.yaml file! You can use this file for storing and managing
-# configuration for deploying your flows. We recommend committing this file to source
-# control along with your flow code.
-
-# Generic metadata about this project
-name: prefectWorkflows
-prefect-version: 3.2.15
-
-# build section allows you to manage and build docker images
-build:
-
-# push section allows you to manage if and how this project is uploaded to remote locations
-push:
-
-# # pull section allows you to provide instructions for cloning this project in remote locations
-pull:
-- prefect.deployments.steps.git_clone:
-    repository: https://github.com/Nikhil-Kudupudi/NUBot.git
-    branch: docker-deployment
-    access_token:
-
-# the deployments section allows you to provide configuration for deploying flows
-deployments:
-- name:
-  version:
-  tags: []
-  description:
-  schedule: {}
-  flow_name:
-  entrypoint:
-  parameters: {}
-  work_pool:
-    name:
-    work_queue_name:
-    job_variables: {}
-- name: default
-  version: 1.0.0
-  tags: []
-  concurrency_limit:
-  description: "cloud run prefect flow"
-  entrypoint: prefectWorkflows/flow.py:scraperflow
-  parameters: {}
-  work_pool:
-    name: my-cloud-run-pool
-    work_queue_name: default
-    job_variables: {}
-  schedules: []

From 203f0f5dba53fe576b88fe0aefff5dea4f99015a Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Sat, 19 Apr 2025 13:21:20 -0400
Subject: [PATCH 34/64] update test changes

---
 .github/workflows/prefect_orchestraiton.yml   |  2 +-
 prefectWorkflows/prefect.yaml                 | 44 +++++++++++++++++++
 prefectWorkflows/{flow.py => scraper_flow.py} |  0
 3 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 prefectWorkflows/prefect.yaml
 rename prefectWorkflows/{flow.py => scraper_flow.py} (100%)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 53ee8af..7dfff7b 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -49,7 +49,7 @@ jobs:
       - name: Register Prefect deployment
         run: |
           cd prefectWorkflows
-          prefect deploy flow.py:scraperflow \
+          prefect deploy scraper_flow.py:scraperflow \
           --name "scraperflow-deployment" \
           --description "ScraperFlow Deployment for scraping tasks" \
           --tag scraper --tag production \
diff --git a/prefectWorkflows/prefect.yaml b/prefectWorkflows/prefect.yaml
new file mode 100644
index 0000000..a6117a8
--- /dev/null
+++ b/prefectWorkflows/prefect.yaml
@@ -0,0 +1,44 @@
+# Welcome to your prefect.yaml file! You can use this file for storing and managing
+# configuration for deploying your flows. We recommend committing this file to source
+# control along with your flow code.
+
+# Generic metadata about this project
+name: prefectWorkflows
+prefect-version: 3.2.15
+
+# build section allows you to manage and build docker images
+# You can leave this empty if not using Docker for deployment
+build: {}
+
+# push section allows you to manage if and how this project is uploaded to remote locations
+# Leave this empty if you don't need to push artifacts
+push: {}
+
+# pull section allows you to provide instructions for cloning this project in remote locations
+# Remove or uncomment and modify if you need this step for pulling code from GitHub or other places
+pull:
+  - prefect.deployments.steps.git_clone:
+      repository: https://github.com/Nikhil-Kudupudi/NUBot.git
+      branch: docker-deployment
+      access_token: <your-access-token>
+
+# Deployments section allows you to provide configuration for deploying flows
+deployments:
+  - name: "scraperflow-deployment"
+    version: "1.0.0"  # Define a version for your deployment
+    tags:
+      - "scraper"
+      - "production"  # Add relevant tags
+    description: "Deployment for ScraperFlow, scheduled weekly"
+    schedule:
+      cron: "0 9 * * 6"  # This sets the flow to run every Saturday at 9 AM
+    flow_name: scraperflow  # The name of your flow function
+    entrypoint: prefectWorkflows/scraper_flow.py:scraperflow  # Path to your flow function
+    parameters: {}  # If your flow takes parameters, add them here
+    work_pool:
+      name: nubot_dataflow  # Specify the name of your work pool
+      work_queue_name: default  # If you have a specific work queue
+      job_variables: {}  # Define any job variables (optional)
+
+  # Example for another deployment, you can add more as needed
+
diff --git a/prefectWorkflows/flow.py b/prefectWorkflows/scraper_flow.py
similarity index 100%
rename from prefectWorkflows/flow.py
rename to prefectWorkflows/scraper_flow.py

From 0b2ce8f22633d9527fe455ebad3ef4675445f803 Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Sat, 19 Apr 2025 13:55:26 -0400
Subject: [PATCH 35/64] update url

---
 .github/workflows/prefect_orchestraiton.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 7dfff7b..577d2ba 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -58,7 +58,7 @@ jobs:
           --work-queue default
 
         env:
-          PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }}
+          PREFECT_API_KEY: https://api.prefect.cloud/api/accounts/${{secrets.PREFECT_ACCOUNT_ID}}/workspaces/${{secrets.PREFECT_WORKSPACE_ID}}
           PREFECT_API_URL: ${{ secrets.PREFECT_API_URL }}
 
       # Step 8: (Optional) Confirm success

From 21860c3cc9b50eaaaed907b8f251cb804a4d3f95 Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Sat, 19 Apr 2025 13:57:39 -0400
Subject: [PATCH 36/64] swap keys

---
 .github/workflows/prefect_orchestraiton.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 577d2ba..a8a87ef 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -58,9 +58,8 @@ jobs:
           --work-queue default
 
         env:
-          PREFECT_API_KEY: https://api.prefect.cloud/api/accounts/${{secrets.PREFECT_ACCOUNT_ID}}/workspaces/${{secrets.PREFECT_WORKSPACE_ID}}
-          PREFECT_API_URL: ${{ secrets.PREFECT_API_URL }}
-
+          PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }}
+          PREFECT_API_URL: https://api.prefect.cloud/api/accounts/${{secrets.PREFECT_ACCOUNT_ID}}/workspaces/${{secrets.PREFECT_WORKSPACE_ID}}
       # Step 8: (Optional) Confirm success
       - name: Deployment successful
         run: echo "Prefect flow deployment updated and scheduled successfully."

From 845906f358292b1d275b6992b0084a346ba84632 Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Sat, 19 Apr 2025 18:20:59 -0400
Subject: [PATCH 37/64] Update prefect_orchestraiton.yml

---
 .github/workflows/prefect_orchestraiton.yml | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index a8a87ef..4a5f462 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -29,17 +29,17 @@ jobs:
           gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev
 
       # Step 4: Build the Docker image
-      # - name: Build Docker image
-      #   run: |
-      #     cd prefectWorkflows
-      #     docker build \
-      #       -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest \
-      #       .
+      - name: Build Docker image
+        run: |
+          cd prefectWorkflows
+          docker build \
+            -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest \
+            .
 
-      # # Step 5: Push the image to Artifact Registry
-      # - name: Push Docker image
-      #   run: |
-      #     docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest
+      # Step 5: Push the image to Artifact Registry
+      - name: Push Docker image
+        run: |
+          docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest
 
       # Step 6: Install Prefect in the runner (for CLI use)
       - name: Install Prefect

From 4cea0fb0e2bcca5bc9483657f2b7b4c2cece21df Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Sat, 19 Apr 2025 18:54:54 -0400
Subject: [PATCH 38/64] Update prefect_orchestraiton.yml: pass --region to the prefect-worker Cloud Run deploy

---
 .github/workflows/prefect_orchestraiton.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 4a5f462..32bb857 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -82,6 +82,7 @@ jobs:
             --image=prefecthq/prefect:3-latest \
             --service-account prefect-sa@${{ secrets.GCP_PROJECT_ID }}.iam.gserviceaccount.com \
             --no-cpu-throttling \
+            --region ${{ secrets.GCP_REGION }} \
             --platform managed \
             --allow-unauthenticated \
             --memory 4Gi \

From 1abb4ed670c0a7897a48e04cc1c670d44a75edb5 Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Sat, 19 Apr 2025 19:20:49 -0400
Subject: [PATCH 39/64] Update prefect_orchestraiton.yml: drop --service-account from the prefect-worker deploy

---
 .github/workflows/prefect_orchestraiton.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 32bb857..12e59b4 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -80,7 +80,6 @@ jobs:
         run: |
           gcloud run deploy prefect-worker \
             --image=prefecthq/prefect:3-latest \
-            --service-account prefect-sa@${{ secrets.GCP_PROJECT_ID }}.iam.gserviceaccount.com \
             --no-cpu-throttling \
             --region ${{ secrets.GCP_REGION }} \
             --platform managed \

From 3f707fae5b33adfd7b4cf0380b7bcd45cad915e2 Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Sat, 19 Apr 2025 19:42:17 -0400
Subject: [PATCH 40/64] Update prefect_orchestraiton.yml: rename image to scraperflow and deploy the worker from it in the build job

---
 .github/workflows/prefect_orchestraiton.yml | 25 ++++++---------------
 1 file changed, 7 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 12e59b4..3dce490 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -33,13 +33,13 @@ jobs:
         run: |
           cd prefectWorkflows
           docker build \
-            -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest \
+            -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \
             .
 
       # Step 5: Push the image to Artifact Registry
       - name: Push Docker image
         run: |
-          docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest
+          docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest
 
       # Step 6: Install Prefect in the runner (for CLI use)
       - name: Install Prefect
@@ -60,31 +60,20 @@ jobs:
         env:
           PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }}
           PREFECT_API_URL: https://api.prefect.cloud/api/accounts/${{secrets.PREFECT_ACCOUNT_ID}}/workspaces/${{secrets.PREFECT_WORKSPACE_ID}}
-      # Step 8: (Optional) Confirm success
-      - name: Deployment successful
-        run: echo "Prefect flow deployment updated and scheduled successfully."
-
-  deploy-worker:
-    needs: build-deploy
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: GCP Authentication
-        uses: google-github-actions/auth@v2
-        with:
-          credentials_json: "${{ secrets.GCP_KEY }}"
-      - name: Set up Google Cloud SDK
-        uses: google-github-actions/setup-gcloud@v2
 
       - name: Deploy Prefect worker to Cloud Run
         run: |
           gcloud run deploy prefect-worker \
-            --image=prefecthq/prefect:3-latest \
+            --image=${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \
             --no-cpu-throttling \
             --region ${{ secrets.GCP_REGION }} \
             --platform managed \
+            --port 5002 \
             --allow-unauthenticated \
             --memory 4Gi \
             --timeout 3600s \
             --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL=${{ secrets.PREFECT_API_URL }},PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} \
             --args "prefect","worker","start","--pool","my-cloud-run-pool","-t","cloud-run"
+      # Step 8: (Optional) Confirm success
+      - name: Deployment successful
+        run: echo "Prefect flow deployment updated and scheduled successfully."

From 5da0cc9b7049aede6a704995cad5a22b67e16cff Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Sun, 20 Apr 2025 01:28:49 -0400
Subject: [PATCH 41/64] Update prefect_orchestraiton.yml: add sh entrypoint that starts the worker alongside an HTTP server on $PORT

---
 .github/workflows/prefect_orchestraiton.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 3dce490..80a3bc4 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -72,6 +72,8 @@ jobs:
             --allow-unauthenticated \
             --memory 4Gi \
             --timeout 3600s \
+            --entrypoint=sh \
+            --args="-c,prefect worker start --pool my-cloud-run-pool -t cloud-run & python3 -m http.server \$PORT" \
             --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL=${{ secrets.PREFECT_API_URL }},PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} \
             --args "prefect","worker","start","--pool","my-cloud-run-pool","-t","cloud-run"
       # Step 8: (Optional) Confirm success

From 6d74ddb80fe8934fd3e1f9580a3d9dc8eb774b9d Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 12:09:13 -0400
Subject: [PATCH 42/64] update flow: add deploy-flow job, move Dockerfile to Prefect 3, register flow via scraperflow.deploy()

---
 .github/workflows/prefect_orchestraiton.yml | 14 ++++++++++++
 prefectWorkflows/Dockerfile                 | 25 ++++++---------------
 prefectWorkflows/scraper_flow.py            | 11 ++++++++-
 3 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 80a3bc4..4ccb59a 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -79,3 +79,17 @@ jobs:
       # Step 8: (Optional) Confirm success
       - name: Deployment successful
         run: echo "Prefect flow deployment updated and scheduled successfully."
+  deploy-flow:
+    needs: build-push
+    runs-on: ubuntu-latest
+    steps:
+      - name: Install Prefect
+        run: pip install prefect==3.*
+
+      - name: Run deploy script
+        env:
+          PREFECT_API_URL: https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}
+          PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }}
+        run: |
+          cd prefectWorkflows
+          python scraper_flow.py
diff --git a/prefectWorkflows/Dockerfile b/prefectWorkflows/Dockerfile
index 5d3a8c7..f3c20ef 100644
--- a/prefectWorkflows/Dockerfile
+++ b/prefectWorkflows/Dockerfile
@@ -1,21 +1,10 @@
-# Use a lightweight Python base image
+# Dockerfile
 FROM python:3.10-slim
-
-# Set working directory in container
+ENV PYTHONUNBUFFERED=1
 WORKDIR /opt/prefect/project
-
-# Copy requirements and install them (if you have a requirements.txt or pyproject.toml)
-COPY requirements.txt ./ 
-RUN pip install -U pip && pip install -r requirements.txt
-
-# Install Prefect (if not already included in requirements)
-RUN pip install prefect==2.*
-
-# Copy the Prefect flow code and related modules into the image
+COPY requirements.txt .
+RUN pip install --upgrade pip \
+ && pip install --no-cache-dir -r requirements.txt \
+ && pip install --no-cache-dir prefect==3.*
 COPY . .
-
-# (Optional) If your flow code is a package with setup.py, you could RUN pip install . instead.
-
-# Set the default command (entrypoint) for the container to do nothing by default.
-# Prefect will override this when running the flow, so we keep it simple.
-CMD ["python", "-c", "print('Prefect flow container ready')"]
+CMD ["python", "-c", "print('ready')"]
diff --git a/prefectWorkflows/scraper_flow.py b/prefectWorkflows/scraper_flow.py
index 2a7ec09..5430fc8 100644
--- a/prefectWorkflows/scraper_flow.py
+++ b/prefectWorkflows/scraper_flow.py
@@ -3,6 +3,7 @@
 from dataflow.chunk_data import chunk_data
 from dotenv import load_dotenv
 import os
+from prefect.docker import DockerImage
 load_dotenv(override=True)
 
 PREFECT_API_KEY=os.getenv('PREFECT_API_KEY')
@@ -30,7 +31,15 @@ def scraperflow():
     #                    push=True
     #                   )
     try:
-        scraperflow()
+        scraperflow.deploy(
+        name="scraperflow-deployment",
+        work_pool_name="my-cloud-run-pool",
+        image=DockerImage(
+            name="us-docker.pkg.dev/nubot-nikhil/backend-nubot/scraperflow:latest",
+            platform="linux/amd64",
+        ),
+        schedule="0 9 * * 6",
+    )
     except Exception as e:
         print(e)
 

From 197187c707d1fdfc1d9fbb6759559aeb60d1ba5a Mon Sep 17 00:00:00 2001
From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com>
Date: Sun, 20 Apr 2025 12:12:59 -0400
Subject: [PATCH 43/64] Update prefect_orchestraiton.yml: remove step comments, --port and duplicate --args; point deploy-flow at build-deploy

---
 .github/workflows/prefect_orchestraiton.yml | 38 ++++++++-------------
 1 file changed, 15 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 4ccb59a..f7233b3 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -3,15 +3,14 @@ name: Deploy Prefect Flow to Cloud Run
 on:
   push:
     branches:
-      - "**" # run on commits to any branch (adjust as needed)
-  workflow_dispatch: # allow manual trigger from the Actions tab
+      - "**"             # run on commits to any branch (adjust as needed)
+  workflow_dispatch:    # allow manual trigger from the Actions tab
 
 jobs:
   build-deploy:
     runs-on: ubuntu-latest
 
     steps:
-      # Step 1: Check out repository code
       - name: Checkout code
         uses: actions/checkout@v3
 
@@ -19,16 +18,14 @@ jobs:
         uses: google-github-actions/auth@v2
         with:
           credentials_json: "${{ secrets.GCP_KEY }}"
-      # Step 2: Set up gcloud CLI
+
       - name: Set up Google Cloud SDK
         uses: google-github-actions/setup-gcloud@v2
 
-      # Step 3: Configure Docker auth for Artifact Registry
       - name: Docker login for Artifact Registry
         run: |
           gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev
 
-      # Step 4: Build the Docker image
       - name: Build Docker image
         run: |
           cd prefectWorkflows
@@ -36,30 +33,26 @@ jobs:
             -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \
             .
 
-      # Step 5: Push the image to Artifact Registry
       - name: Push Docker image
         run: |
           docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest
 
-      # Step 6: Install Prefect in the runner (for CLI use)
       - name: Install Prefect
         run: pip install --no-cache prefect==2.*
 
-      # Step 7: Register/Update Prefect deployment
       - name: Register Prefect deployment
         run: |
           cd prefectWorkflows
           prefect deploy scraper_flow.py:scraperflow \
-          --name "scraperflow-deployment" \
-          --description "ScraperFlow Deployment for scraping tasks" \
-          --tag scraper --tag production \
-          --cron "0 9 * * 6" \
-          --pool my-cloud-run-pool \
-          --work-queue default
-
+            --name "scraperflow-deployment" \
+            --description "ScraperFlow Deployment for scraping tasks" \
+            --tag scraper --tag production \
+            --cron "0 9 * * 6" \
+            --pool my-cloud-run-pool \
+            --work-queue default
         env:
           PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }}
-          PREFECT_API_URL: https://api.prefect.cloud/api/accounts/${{secrets.PREFECT_ACCOUNT_ID}}/workspaces/${{secrets.PREFECT_WORKSPACE_ID}}
+          PREFECT_API_URL: "https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}"
 
       - name: Deploy Prefect worker to Cloud Run
         run: |
@@ -68,27 +61,26 @@ jobs:
             --no-cpu-throttling \
             --region ${{ secrets.GCP_REGION }} \
             --platform managed \
-            --port 5002 \
             --allow-unauthenticated \
             --memory 4Gi \
             --timeout 3600s \
             --entrypoint=sh \
             --args="-c,prefect worker start --pool my-cloud-run-pool -t cloud-run & python3 -m http.server \$PORT" \
-            --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL=${{ secrets.PREFECT_API_URL }},PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} \
-            --args "prefect","worker","start","--pool","my-cloud-run-pool","-t","cloud-run"
-      # Step 8: (Optional) Confirm success
+            --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL=${{ secrets.PREFECT_API_URL }},PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }}
       - name: Deployment successful
         run: echo "Prefect flow deployment updated and scheduled successfully."
+
   deploy-flow:
-    needs: build-push
+    needs: build-deploy
     runs-on: ubuntu-latest
+
     steps:
       - name: Install Prefect
         run: pip install prefect==3.*
 
       - name: Run deploy script
         env:
-          PREFECT_API_URL: https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}
+          PREFECT_API_URL: "https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}"
           PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }}
         run: |
           cd prefectWorkflows

From bd0f1f26e6fb4601d238b0e806e0197af86c4b96 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 13:02:49 -0400
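Subject: [PATCH 44/64] update: deploy via Prefect GitHub actions, split out deploy-worker job, use package-qualified dataflow imports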

---
 .github/workflows/prefect_orchestraiton.yml | 106 ++++++++++----------
 prefectWorkflows/.env                       |   2 +-
 prefectWorkflows/dataflow/chunk_data.py     |   2 +-
 prefectWorkflows/dataflow/scraper.py        |   2 +-
 4 files changed, 56 insertions(+), 56 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index f7233b3..cc55b59 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -1,87 +1,87 @@
-name: Deploy Prefect Flow to Cloud Run
+name: CI/CD Prefect Flow & Worker to Cloud Run
 
 on:
   push:
     branches:
-      - "**"             # run on commits to any branch (adjust as needed)
-  workflow_dispatch:    # allow manual trigger from the Actions tab
+      - "**" # run on commits to any branch (adjust as needed)
+  workflow_dispatch: # allow manual trigger
 
 jobs:
-  build-deploy:
+  build-and-deploy:
     runs-on: ubuntu-latest
 
     steps:
       - name: Checkout code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: GCP Authentication
         uses: google-github-actions/auth@v2
         with:
-          credentials_json: "${{ secrets.GCP_KEY }}"
+          credentials_json: ${{ secrets.GCP_KEY }}
 
-      - name: Set up Google Cloud SDK
+      - name: Setup gcloud CLI
         uses: google-github-actions/setup-gcloud@v2
+        with:
+          project_id: ${{ secrets.GCP_PROJECT_ID }}
 
-      - name: Docker login for Artifact Registry
-        run: |
-          gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev
+      - name: Docker login to Artifact Registry
+        run: gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev
 
-      - name: Build Docker image
+      - name: Build & Push Docker image
         run: |
           cd prefectWorkflows
           docker build \
             -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \
             .
-
-      - name: Push Docker image
-        run: |
           docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest
 
-      - name: Install Prefect
-        run: pip install --no-cache prefect==2.*
+      - name: Setup Python & Prefect CLI
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - run: pip install prefect==3.*
 
-      - name: Register Prefect deployment
-        run: |
-          cd prefectWorkflows
-          prefect deploy scraper_flow.py:scraperflow \
-            --name "scraperflow-deployment" \
-            --description "ScraperFlow Deployment for scraping tasks" \
-            --tag scraper --tag production \
-            --cron "0 9 * * 6" \
-            --pool my-cloud-run-pool \
-            --work-queue default
-        env:
-          PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }}
-          PREFECT_API_URL: "https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}"
+      - name: Prefect Auth & Deploy
+        uses: PrefectHQ/actions-prefect-auth@v1
+        with:
+          prefect-api-key: ${{ secrets.PREFECT_API_KEY }}
+          prefect-workspace: ${{ secrets.PREFECT_WORKSPACE_ID }}
+      - uses: PrefectHQ/actions-prefect-deploy@v4
+        with:
+          deployment-names: scraperflow-deployment
+          requirements-file-paths: requirements.txt
+          deployment-file-path: prefect.yaml
 
-      - name: Deploy Prefect worker to Cloud Run
-        run: |
-          gcloud run deploy prefect-worker \
-            --image=${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \
-            --no-cpu-throttling \
-            --region ${{ secrets.GCP_REGION }} \
-            --platform managed \
-            --allow-unauthenticated \
-            --memory 4Gi \
-            --timeout 3600s \
-            --entrypoint=sh \
-            --args="-c,prefect worker start --pool my-cloud-run-pool -t cloud-run & python3 -m http.server \$PORT" \
-            --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL=${{ secrets.PREFECT_API_URL }},PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }}
       - name: Deployment successful
-        run: echo "Prefect flow deployment updated and scheduled successfully."
+        run: echo "Prefect flow deployed ✔️"
 
-  deploy-flow:
-    needs: build-deploy
+  deploy-worker:
+    needs: build-and-deploy
     runs-on: ubuntu-latest
 
     steps:
-      - name: Install Prefect
-        run: pip install prefect==3.*
+      - name: GCP Authentication
+        uses: google-github-actions/auth@v2
+        with:
+          credentials_json: ${{ secrets.GCP_KEY }}
 
-      - name: Run deploy script
-        env:
-          PREFECT_API_URL: "https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}"
-          PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }}
+      - name: Setup gcloud CLI
+        uses: google-github-actions/setup-gcloud@v2
+        with:
+          project_id: ${{ secrets.GCP_PROJECT_ID }}
+
+      - name: Deploy Prefect Worker to Cloud Run
         run: |
-          cd prefectWorkflows
-          python scraper_flow.py
+          gcloud run deploy prefect-worker \
+            --image=${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \
+            --platform managed \
+            --region ${{ secrets.GCP_REGION }} \
+            --allow-unauthenticated \
+            --no-cpu-throttling \
+            --min-instances=1 \
+            --timeout=3600s \
+            --memory=4Gi \
+            --args="-c,prefect worker start --pool my-cloud-run-pool -t cloud-run & python3 -m http.server \$PORT" \
+            --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL= "https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}",PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }}
+      - name: Worker deployed
+        run: echo "Prefect worker deployed ✔️"
diff --git a/prefectWorkflows/.env b/prefectWorkflows/.env
index 0b2b74e..2e9444b 100644
--- a/prefectWorkflows/.env
+++ b/prefectWorkflows/.env
@@ -8,4 +8,4 @@ RAW_DATA_FOLDER=raw_data
 FAISS_INDEX_FOLDER=faiss_index
 GOOGLE_APPLICATION_CREDENTIALS="E:/gcpkeys/nubot/nubot-nikhil-6adeee091d55.json"
 PREFECT_API_KEY=pnu_VuQWNSlXmc2Hqknf
-PREFECT_API_URL="https://api.prefect.cloud/api/accounts-8a55-446a-ac46-80a3f843d8b6"
\ No newline at end of file
+PREFECT_API_URL="https://api.prefect.cloud/api/accounts/806f2e07-5063-4fbe-9b46-0545ad5de2d1/workspaces/acdf9e9e-8a55-446a-ac46-80a3f843d8b6"
diff --git a/prefectWorkflows/dataflow/chunk_data.py b/prefectWorkflows/dataflow/chunk_data.py
index 265ce1a..fc35e37 100644
--- a/prefectWorkflows/dataflow/chunk_data.py
+++ b/prefectWorkflows/dataflow/chunk_data.py
@@ -8,7 +8,7 @@
 from dotenv import load_dotenv
 from google.cloud.storage import Client
 
-from store_data import upload_faiss_index_to_bucket
+from dataflow.store_data import upload_faiss_index_to_bucket
 load_dotenv(override=True)
 BUCKET_NAME= os.getenv('BUCKET_NAME')
 from google.auth import default
diff --git a/prefectWorkflows/dataflow/scraper.py b/prefectWorkflows/dataflow/scraper.py
index 0003587..4676a4a 100644
--- a/prefectWorkflows/dataflow/scraper.py
+++ b/prefectWorkflows/dataflow/scraper.py
@@ -7,7 +7,7 @@
 from urllib.parse import urljoin, urlparse
 from dotenv import load_dotenv
 import hashlib
-from store_data import upload_many_blobs_with_transfer_manager
+from dataflow.store_data import upload_many_blobs_with_transfer_manager
 load_dotenv(override=True)
 # Configuration
 URLS_LIST=list(os.getenv('URLS_LIST','').split(","))

From 82d458289113d217d320ec540ef776c2407d03b3 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 13:11:41 -0400
Subject: [PATCH 45/64] add prefect workspace name; comment out gcloud setup and Docker build/push steps

---
 .github/workflows/prefect_orchestraiton.yml | 28 ++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index cc55b59..c237d0a 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -19,21 +19,21 @@ jobs:
         with:
           credentials_json: ${{ secrets.GCP_KEY }}
 
-      - name: Setup gcloud CLI
-        uses: google-github-actions/setup-gcloud@v2
-        with:
-          project_id: ${{ secrets.GCP_PROJECT_ID }}
+      # - name: Setup gcloud CLI
+      #   uses: google-github-actions/setup-gcloud@v2
+      #   with:
+      #     project_id: ${{ secrets.GCP_PROJECT_ID }}
 
-      - name: Docker login to Artifact Registry
-        run: gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev
+      # - name: Docker login to Artifact Registry
+      #   run: gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev
 
-      - name: Build & Push Docker image
-        run: |
-          cd prefectWorkflows
-          docker build \
-            -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \
-            .
-          docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest
+      # - name: Build & Push Docker image
+      #   run: |
+      #     cd prefectWorkflows
+      #     docker build \
+      #       -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \
+      #       .
+      #     docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest
 
       - name: Setup Python & Prefect CLI
         uses: actions/setup-python@v5
@@ -45,7 +45,7 @@ jobs:
         uses: PrefectHQ/actions-prefect-auth@v1
         with:
           prefect-api-key: ${{ secrets.PREFECT_API_KEY }}
-          prefect-workspace: ${{ secrets.PREFECT_WORKSPACE_ID }}
+          prefect-workspace: ${{ secrets.PREFECT_WORKSPACE_NAME }}
       - uses: PrefectHQ/actions-prefect-deploy@v4
         with:
           deployment-names: scraperflow-deployment

From 1d8ac748225e8e70ccc09a2a3f2c3d809a90f8d2 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 14:50:24 -0400
Subject: [PATCH 46/64] update test flow

---
 .github/workflows/github-orchestration.yml  |   0
 .github/workflows/prefect_orchestraiton.yml | 127 +++++++++++---------
 prefectWorkflows/Dockerfile                 |  36 ++++--
 prefectWorkflows/scraper_flow.py            |  18 +--
 4 files changed, 101 insertions(+), 80 deletions(-)
 create mode 100644 .github/workflows/github-orchestration.yml

diff --git a/.github/workflows/github-orchestration.yml b/.github/workflows/github-orchestration.yml
new file mode 100644
index 0000000..e69de29
diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index c237d0a..4af72d8 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -1,87 +1,98 @@
-name: CI/CD Prefect Flow & Worker to Cloud Run
+name: Deploy Prefect Flow to Cloud Run
 
 on:
   push:
     branches:
-      - "**" # run on commits to any branch (adjust as needed)
-  workflow_dispatch: # allow manual trigger
+      - main # or any branches you want to trigger the workflow
+  workflow_dispatch: # allows manual triggering
+
+env:
+  PREFECT_API_URL: https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}
+  PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }}
 
 jobs:
   build-and-deploy:
     runs-on: ubuntu-latest
-
     steps:
       - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install prefect==2.* python-dotenv
 
       - name: GCP Authentication
         uses: google-github-actions/auth@v2
         with:
-          credentials_json: ${{ secrets.GCP_KEY }}
-
-      # - name: Setup gcloud CLI
-      #   uses: google-github-actions/setup-gcloud@v2
-      #   with:
-      #     project_id: ${{ secrets.GCP_PROJECT_ID }}
+          credentials_json: "${{ secrets.GCP_KEY }}"
 
-      # - name: Docker login to Artifact Registry
-      #   run: gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev
+      - name: Set up Google Cloud SDK
+        uses: google-github-actions/setup-gcloud@v2
 
-      # - name: Build & Push Docker image
-      #   run: |
-      #     cd prefectWorkflows
-      #     docker build \
-      #       -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \
-      #       .
-      #     docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest
+      - name: Configure Docker for Artifact Registry
+        run: |
+          gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev
 
-      - name: Setup Python & Prefect CLI
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.12"
-      - run: pip install prefect==3.*
+      - name: Create Prefect Cloud Run pool if not exists
+        run: |
+          # Check if pool exists, create if it doesn't
+          if ! prefect pool ls | grep -q "my-cloud-run-pool"; then
+            prefect pool create my-cloud-run-pool --type cloud-run
+          fi
 
-      - name: Prefect Auth & Deploy
-        uses: PrefectHQ/actions-prefect-auth@v1
-        with:
-          prefect-api-key: ${{ secrets.PREFECT_API_KEY }}
-          prefect-workspace: ${{ secrets.PREFECT_WORKSPACE_NAME }}
-      - uses: PrefectHQ/actions-prefect-deploy@v4
-        with:
-          deployment-names: scraperflow-deployment
-          requirements-file-paths: requirements.txt
-          deployment-file-path: prefect.yaml
+          # Check if work queue exists, create if it doesn't
+          if ! prefect work-queue ls | grep -q "default"; then
+            prefect work-queue create default --pool my-cloud-run-pool
+          fi
 
-      - name: Deployment successful
-        run: echo "Prefect flow deployed ✔️"
+      - name: Build Docker image
+        run: |
+          cd prefectWorkflows
+          docker build \
+            -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \
+            .
 
-  deploy-worker:
-    needs: build-and-deploy
-    runs-on: ubuntu-latest
+      - name: Push Docker image
+        run: |
+          docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest
 
-    steps:
-      - name: GCP Authentication
-        uses: google-github-actions/auth@v2
-        with:
-          credentials_json: ${{ secrets.GCP_KEY }}
+      - name: Verify Prefect Connectivity
+        run: |
+          prefect cloud login -k ${{secrets.PREFECT_API_KEY}}
+          prefect config set PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }}
+          prefect config set PREFECT_API_URL=${{ secrets.PREFECT_API_URL }}
+          prefect cloud workspace ls
 
-      - name: Setup gcloud CLI
-        uses: google-github-actions/setup-gcloud@v2
-        with:
-          project_id: ${{ secrets.GCP_PROJECT_ID }}
+      # If that succeeded, proceed with deployment
+      - name: Register Prefect deployment
+        run: |
+          cd prefectWorkflows
+          prefect deploy scraper_flow.py:scraperflow \
+            --name "scraperflow-deployment" \
+            --description "ScraperFlow Deployment for scraping tasks" \
+            --tag scraper --tag production \
+            --cron "0 9 * * 6" \
+            --pool my-cloud-run-pool \
+            --work-queue default
 
-      - name: Deploy Prefect Worker to Cloud Run
+      - name: Deploy Prefect worker to Cloud Run
         run: |
           gcloud run deploy prefect-worker \
             --image=${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \
-            --platform managed \
             --region ${{ secrets.GCP_REGION }} \
+            --platform managed \
+            --port 8080 \
             --allow-unauthenticated \
-            --no-cpu-throttling \
-            --min-instances=1 \
-            --timeout=3600s \
-            --memory=4Gi \
-            --args="-c,prefect worker start --pool my-cloud-run-pool -t cloud-run & python3 -m http.server \$PORT" \
+            --memory 4Gi \
+            --cpu 2 \
+            --timeout 3600s \
+            --concurrency 80 \
             --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL= "https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}",PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }}
-      - name: Worker deployed
-        run: echo "Prefect worker deployed ✔️"
+      - name: Deployment successful
+        run: echo "Prefect flow deployment updated and scheduled successfully."
diff --git a/prefectWorkflows/Dockerfile b/prefectWorkflows/Dockerfile
index f3c20ef..f33c503 100644
--- a/prefectWorkflows/Dockerfile
+++ b/prefectWorkflows/Dockerfile
@@ -1,10 +1,32 @@
-# Dockerfile
 FROM python:3.10-slim
-ENV PYTHONUNBUFFERED=1
-WORKDIR /opt/prefect/project
+
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements file
 COPY requirements.txt .
-RUN pip install --upgrade pip \
- && pip install --no-cache-dir -r requirements.txt \
- && pip install --no-cache-dir prefect==3.*
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the application code
 COPY . .
-CMD ["python", "-c", "print('ready')"]
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+
+# Default command to run the Prefect worker
+CMD ["prefect", "worker", "start", "--pool", "my-cloud-run-pool", "--type", "cloud-run"]
+
+# Expose the port that Cloud Run expects
+EXPOSE 8080
+
+# Health check endpoint for Cloud Run
+HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
+  CMD curl -f http://localhost:8080/ || exit 1
\ No newline at end of file
diff --git a/prefectWorkflows/scraper_flow.py b/prefectWorkflows/scraper_flow.py
index 5430fc8..c2d7602 100644
--- a/prefectWorkflows/scraper_flow.py
+++ b/prefectWorkflows/scraper_flow.py
@@ -24,22 +24,10 @@ def scraperflow():
 
 if __name__ == "__main__":
 # # Run the flow
-## for cloud
-    # scraperflow.deploy(name="my-first-deployment",
-    #                    work_pool_name="dataflow",
-    #                    image='prefecthq/prefect:2-python3.10',
-    #                    push=True
-    #                   )
+
     try:
-        scraperflow.deploy(
-        name="scraperflow-deployment",
-        work_pool_name="my-cloud-run-pool",
-        image=DockerImage(
-            name="us-docker.pkg.dev/nubot-nikhil/backend-nubot/scraperflow:latest",
-            platform="linux/amd64",
-        ),
-        schedule="0 9 * * 6",
-    )
+
+        scraperflow()
     except Exception as e:
         print(e)
 

From 771d9a5a6819b8f8a24a1bcc94627f581767ef87 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 14:56:19 -0400
Subject: [PATCH 47/64] test

---
 .github/workflows/prefect_orchestraiton.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 4af72d8..a90a692 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -3,7 +3,7 @@ name: Deploy Prefect Flow to Cloud Run
 on:
   push:
     branches:
-      - main # or any branches you want to trigger the workflow
+      - "**" # or any branches you want to trigger the workflow
   workflow_dispatch: # allows manual triggering
 
 env:

From 5be490997ebea41ffeec31710db70cb9eac8c20e Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 15:32:49 -0400
Subject: [PATCH 48/64] update test 2

---
 .github/workflows/prefect_orchestraiton.yml | 120 ++++++++------------
 prefectWorkflows/Dockerfile                 |  40 +++----
 prefectWorkflows/prefect.yaml               |  56 +++------
 prefectWorkflows/scraper_flow.py            |  55 ++++-----
 4 files changed, 108 insertions(+), 163 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index a90a692..a322211 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -2,13 +2,7 @@ name: Deploy Prefect Flow to Cloud Run
 
 on:
   push:
-    branches:
-      - "**" # or any branches you want to trigger the workflow
-  workflow_dispatch: # allows manual triggering
-
-env:
-  PREFECT_API_URL: https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}
-  PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }}
+    branches: [main] # Trigger on push to main (adjust as needed)
 
 jobs:
   build-and-deploy:
@@ -17,82 +11,62 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v3
 
-      - name: Set up Python
-        uses: actions/setup-python@v4
+      # Authenticate to Google Cloud using the service account JSON key
+      - name: Set up gcloud CLI
+        uses: google-github-actions/setup-gcloud@v1
         with:
-          python-version: "3.10"
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install prefect==2.* python-dotenv
-
-      - name: GCP Authentication
-        uses: google-github-actions/auth@v2
-        with:
-          credentials_json: "${{ secrets.GCP_KEY }}"
-
-      - name: Set up Google Cloud SDK
-        uses: google-github-actions/setup-gcloud@v2
+          service_account_key: ${{ secrets.GCP_KEY }}
+          project_id: ${{ secrets.GCP_PROJECT_ID }}
+          export_default_credentials: true
 
-      - name: Configure Docker for Artifact Registry
+      - name: Configure Docker auth for Artifact Registry
         run: |
-          gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev
-
-      - name: Create Prefect Cloud Run pool if not exists
-        run: |
-          # Check if pool exists, create if it doesn't
-          if ! prefect pool ls | grep -q "my-cloud-run-pool"; then
-            prefect pool create my-cloud-run-pool --type cloud-run
-          fi
-
-          # Check if work queue exists, create if it doesn't
-          if ! prefect work-queue ls | grep -q "default"; then
-            prefect work-queue create default --pool my-cloud-run-pool
-          fi
+          gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev -q
+        # The above command logs Docker into Artifact Registry
+        # using the gcloud credentials (no interactive prompt due to -q).
 
       - name: Build Docker image
         run: |
-          cd prefectWorkflows
-          docker build \
-            -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \
-            .
+          IMAGE_URI="${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/<YOUR_AR_REPOSITORY>/prefect-scraper:latest"
+          echo "Building image $IMAGE_URI"
+          docker build -t "$IMAGE_URI" -f prefectWorkflows/Dockerfile .
+          # Note: The context is the repository root (.), adjust path to Dockerfile if needed.
 
-      - name: Push Docker image
+      - name: Push Docker image to Artifact Registry
         run: |
-          docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest
+          IMAGE_URI="${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/<YOUR_AR_REPOSITORY>/prefect-scraper:latest"
+          docker push "$IMAGE_URI"
+        # After this step, the image is available in Artifact Registry for Cloud Run to use.
 
-      - name: Verify Prefect Connectivity
-        run: |
-          prefect cloud login -k ${{secrets.PREFECT_API_KEY}}
-          prefect config set PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }}
-          prefect config set PREFECT_API_URL=${{ secrets.PREFECT_API_URL }}
-          prefect cloud workspace ls
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+
+      - name: Install Prefect
+        run: pip install prefect==3.1.10
 
-      # If that succeeded, proceed with deployment
-      - name: Register Prefect deployment
+      - name: Authenticate with Prefect Cloud
+        env:
+          PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }}
         run: |
-          cd prefectWorkflows
-          prefect deploy scraper_flow.py:scraperflow \
-            --name "scraperflow-deployment" \
-            --description "ScraperFlow Deployment for scraping tasks" \
-            --tag scraper --tag production \
-            --cron "0 9 * * 6" \
-            --pool my-cloud-run-pool \
-            --work-queue default
+          # Use Prefect CLI to log in to Prefect Cloud non-interactively
+          prefect cloud login -k $PREFECT_API_KEY -w "${{ secrets.PREFECT_WORKSPACE }}" || {
+            # Fallback: manually set API URL and API Key if the above doesn't work
+            echo "Using manual Prefect Cloud authentication..."
+            prefect config set PREFECT_API_URL="https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}"
+            prefect config set PREFECT_API_KEY="${{ secrets.PREFECT_API_KEY }}"
+          }
+        # This step authenticates the CLI with Prefect Cloud.
+        # We first attempt `prefect cloud login` (with API key and workspace) for convenience.
+        # If that fails (for example, if workspace name flag is not supported non-interactively),
+        # we fall back to setting PREFECT_API_URL and PREFECT_API_KEY directly.
+        # The PREFECT_API_URL uses your Account ID and Workspace ID (from secrets)
+        # to target the correct workspace. This avoids "401 Unauthorized" due to wrong API URL.
 
-      - name: Deploy Prefect worker to Cloud Run
+      - name: Deploy Prefect flow
         run: |
-          gcloud run deploy prefect-worker \
-            --image=${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \
-            --region ${{ secrets.GCP_REGION }} \
-            --platform managed \
-            --port 8080 \
-            --allow-unauthenticated \
-            --memory 4Gi \
-            --cpu 2 \
-            --timeout 3600s \
-            --concurrency 80 \
-            --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL= "https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}",PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }}
-      - name: Deployment successful
-        run: echo "Prefect flow deployment updated and scheduled successfully."
+          cd prefectWorkflows   # navigate to the folder containing prefect.yaml
+          prefect deploy -n scraperflow-deployment
+        # The -n flag ensures we deploy the specific deployment by name (optional if only one deployment in YAML).
+        # This command reads prefect.yaml and registers/updates the deployment in Prefect Cloud.
diff --git a/prefectWorkflows/Dockerfile b/prefectWorkflows/Dockerfile
index f33c503..ea2d944 100644
--- a/prefectWorkflows/Dockerfile
+++ b/prefectWorkflows/Dockerfile
@@ -1,32 +1,24 @@
+# Start from a lightweight Python image (use the appropriate Python version)
 FROM python:3.10-slim
 
+# Set working directory in container
 WORKDIR /app
 
-# Install system dependencies
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    && rm -rf /var/lib/apt/lists/*
+# Install Python dependencies.
+# If you have a requirements.txt, copy and install it:
+COPY requirements.txt . 
+RUN pip install -r requirements.txt
 
-# Copy requirements file
-COPY requirements.txt .
+# (Alternatively, directly install Prefect and any needed libraries)
+# RUN pip install prefect==3.1.10
 
-# Install Python dependencies
-RUN pip install --no-cache-dir -r requirements.txt
+# Copy the Prefect flow code and the dataflow module into the image
+COPY prefectWorkflows/ /app/prefectWorkflows/
+COPY dataflow/ /app/dataflow/
 
-# Copy the application code
-COPY . .
+# Ensure Python can find the 'dataflow' module (add /app to PYTHONPATH)
+ENV PYTHONPATH="/app:${PYTHONPATH}"
 
-# Set environment variables
-ENV PYTHONUNBUFFERED=1
-
-# Default command to run the Prefect worker
-CMD ["prefect", "worker", "start", "--pool", "my-cloud-run-pool", "--type", "cloud-run"]
-
-# Expose the port that Cloud Run expects
-EXPOSE 8080
-
-# Health check endpoint for Cloud Run
-HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
-  CMD curl -f http://localhost:8080/ || exit 1
\ No newline at end of file
+# (Optional) Set a default command (Prefect Cloud will override this when submitting the flow run)
+# By default, do nothing or use a generic command. Prefect Cloud's work pool will specify the entrypoint at runtime.
+CMD ["python", "-c", "print('Container built for Prefect flow execution')"]
diff --git a/prefectWorkflows/prefect.yaml b/prefectWorkflows/prefect.yaml
index a6117a8..70e2399 100644
--- a/prefectWorkflows/prefect.yaml
+++ b/prefectWorkflows/prefect.yaml
@@ -1,44 +1,20 @@
-# Welcome to your prefect.yaml file! You can use this file for storing and managing
-# configuration for deploying your flows. We recommend committing this file to source
-# control along with your flow code.
+# Prefect deployment configuration for the scraper_flow
+name: scraper-flow-project # Name of the project (can be any identifier for your reference)
+prefect-version: 3.1.10 # Prefect version to use for this deployment (match your Prefect 3.x version)
 
-# Generic metadata about this project
-name: prefectWorkflows
-prefect-version: 3.2.15
-
-# build section allows you to manage and build docker images
-# You can leave this empty if not using Docker for deployment
-build: {}
-
-# push section allows you to manage if and how this project is uploaded to remote locations
-# Leave this empty if you don't need to push artifacts
-push: {}
-
-# pull section allows you to provide instructions for cloning this project in remote locations
-# Remove or uncomment and modify if you need this step for pulling code from GitHub or other places
-pull:
-  - prefect.deployments.steps.git_clone:
-      repository: https://github.com/Nikhil-Kudupudi/NUBot.git
-      branch: docker-deployment
-      access_token: <your-access-token>
-
-# Deployments section allows you to provide configuration for deploying flows
 deployments:
-  - name: "scraperflow-deployment"
-    version: "1.0.0"  # Define a version for your deployment
-    tags:
-      - "scraper"
-      - "production"  # Add relevant tags
-    description: "Deployment for ScraperFlow, scheduled weekly"
+  - name: scraperflow-deployment # Name of this deployment (appears in Prefect UI)
+    description: "Scrapes all URLs and segments data every Saturday at 9:00 UTC"
+    entrypoint: scraper_flow.py:scraperflow # Entry point to the flow: "<script path>:<flow function>"
+    # Cron schedule for every Saturday at 9:00 AM
     schedule:
-      cron: "0 9 * * 6"  # This sets the flow to run every Saturday at 9 AM
-    flow_name: scraperflow  # The name of your flow function
-    entrypoint: prefectWorkflows/scraper_flow.py:scraperflow  # Path to your flow function
-    parameters: {}  # If your flow takes parameters, add them here
+      cron: "0 9 * * 6" # Cron expression for Saturday 09:00 (UTC)
+      timezone: "UTC" # Timezone for the schedule (adjust if needed)
+    parameters: {} # Default parameters (empty since this flow has none)
     work_pool:
-      name: nubot_dataflow  # Specify the name of your work pool
-      work_queue_name: default  # If you have a specific work queue
-      job_variables: {}  # Define any job variables (optional)
-
-  # Example for another deployment, you can add more as needed
-
+      name: "my-cloud-run-pool" # Name of the push work pool for Cloud Run
+      work_queue_name: "default" # Work queue (use "default" or as configured in the pool)
+    tags: [] # (Optional) any tags for the deployment
+    # (Optional) infrastructure overrides can be specified if needed:
+    # infra_overrides:
+    #   image: "us-east1-docker.pkg.dev/<YOUR_GCP_PROJECT>/<YOUR_AR_REPOSITORY>/prefect-scraper:latest"
diff --git a/prefectWorkflows/scraper_flow.py b/prefectWorkflows/scraper_flow.py
index c2d7602..7521eee 100644
--- a/prefectWorkflows/scraper_flow.py
+++ b/prefectWorkflows/scraper_flow.py
@@ -1,33 +1,36 @@
-from prefect import flow, task
-from dataflow.scraper import  scrape_and_load_task 
-from dataflow.chunk_data import chunk_data
-from dotenv import load_dotenv
-import os
-from prefect.docker import DockerImage
-load_dotenv(override=True)
-
-PREFECT_API_KEY=os.getenv('PREFECT_API_KEY')
+from prefect import flow, task, get_run_logger
+# Import the supporting functions from dataflow module
+from dataflow.scraping import scrape_and_load_task  # adjust import to actual module path
+from dataflow.processing import chunk_data
 
+# Define Prefect tasks
 @task
 def scrape_all_urls_task():
-    # If scrape_all_urls is an imported function, call it here and return the result
-    return scrape_and_load_task()  # or return the relevant data
+    """Task to scrape all URLs and load raw data."""
+    logger = get_run_logger()
+    logger.info("Starting scrape_all_urls_task...")
+    data = scrape_and_load_task()  # call the helper function to scrape and load data
+    logger.info(f"Scraped data: {len(data)} items.")
+    return data
+
 @task
-def dataSegmentation():
-    return chunk_data()
+def dataSegmentation(data):
+    """Task to segment the scraped data into chunks."""
+    logger = get_run_logger()
+    logger.info("Starting dataSegmentation task...")
+    segments = chunk_data(data)  # call helper to chunk the data
+    logger.info(f"Segmented data into {len(segments)} chunks.")
+    return segments
 
-@flow(log_prints=True)
-def scraperflow():
-    # Use the tasks within the flow
-    scrape_all_urls_task()
-    dataSegmentation()
+@flow
+def scraper_flow():
+    """Prefect flow to orchestrate scraping and data segmentation."""
+    # Run the scraping task and then pass its result into the segmentation task
+    raw_data = scrape_all_urls_task()
+    segmented = dataSegmentation(raw_data)
+    # (Optional) do something with segmented data, e.g., save or return
+    return "done"
 
 if __name__ == "__main__":
-# # Run the flow
-
-    try:
-
-        scraperflow()
-    except Exception as e:
-        print(e)
-
+    # This allows testing the flow locally by running this script
+    scraper_flow()

From 73026268c6a63a23580658df668a9d35ff3a3115 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 15:38:31 -0400
Subject: [PATCH 49/64] cha

---
 .github/workflows/prefect_orchestraiton.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index a322211..45af96f 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -2,7 +2,7 @@ name: Deploy Prefect Flow to Cloud Run
 
 on:
   push:
-    branches: [main] # Trigger on push to main (adjust as needed)
+    branches: ["**"] # Trigger on push to any branch (adjust as needed)
 
 jobs:
   build-and-deploy:

From 37a98c9ac735032d1698f11354bed998c517fd07 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 15:44:33 -0400
Subject: [PATCH 50/64] df

---
 .github/workflows/prefect_orchestraiton.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 45af96f..b82ca3a 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -27,14 +27,14 @@ jobs:
 
       - name: Build Docker image
         run: |
-          IMAGE_URI="${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/<YOUR_AR_REPOSITORY>/prefect-scraper:latest"
+          IMAGE_URI="${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/prefect-scraper:latest"
           echo "Building image $IMAGE_URI"
           docker build -t "$IMAGE_URI" -f prefectWorkflows/Dockerfile .
           # Note: The context is the repository root (.), adjust path to Dockerfile if needed.
 
       - name: Push Docker image to Artifact Registry
         run: |
-          IMAGE_URI="${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/<YOUR_AR_REPOSITORY>/prefect-scraper:latest"
+          IMAGE_URI="${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/prefect-scraper:latest"
           docker push "$IMAGE_URI"
         # After this step, the image is available in Artifact Registry for Cloud Run to use.
 

From 68060747f5a17f14faf50740a2423dada42fcc91 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 15:46:51 -0400
Subject: [PATCH 51/64] she

---
 prefectWorkflows/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prefectWorkflows/Dockerfile b/prefectWorkflows/Dockerfile
index ea2d944..1e6055f 100644
--- a/prefectWorkflows/Dockerfile
+++ b/prefectWorkflows/Dockerfile
@@ -14,7 +14,7 @@ RUN pip install -r requirements.txt
 
 # Copy the Prefect flow code and the dataflow module into the image
 COPY prefectWorkflows/ /app/prefectWorkflows/
-COPY dataflow/ /app/dataflow/
+
 
 # Ensure Python can find the 'dataflow' module (add /app to PYTHONPATH)
 ENV PYTHONPATH="/app:${PYTHONPATH}"

From df2b3feec63a7d6083c4b08528d26b5ccea07cf2 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 16:37:36 -0400
Subject: [PATCH 52/64] up

---
 prefectWorkflows/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prefectWorkflows/Dockerfile b/prefectWorkflows/Dockerfile
index 1e6055f..aef5e81 100644
--- a/prefectWorkflows/Dockerfile
+++ b/prefectWorkflows/Dockerfile
@@ -7,7 +7,7 @@ WORKDIR /app
 # Install Python dependencies.
 # If you have a requirements.txt, copy and install it:
 COPY requirements.txt . 
-RUN pip install -r requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
 
 # (Alternatively, directly install Prefect and any needed libraries)
 # RUN pip install prefect==3.1.10

From 3e4a9af309328be0b5042afdb2f3780de90757fe Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 16:43:13 -0400
Subject: [PATCH 53/64] up

---
 prefectWorkflows/Dockerfile              | 1 +
 requirements.txt => run-requirements.txt | 0
 2 files changed, 1 insertion(+)
 rename requirements.txt => run-requirements.txt (100%)

diff --git a/prefectWorkflows/Dockerfile b/prefectWorkflows/Dockerfile
index aef5e81..7af7649 100644
--- a/prefectWorkflows/Dockerfile
+++ b/prefectWorkflows/Dockerfile
@@ -7,6 +7,7 @@ WORKDIR /app
 # Install Python dependencies.
 # If you have a requirements.txt, copy and install it:
 COPY requirements.txt . 
+
 RUN pip install --no-cache-dir -r requirements.txt
 
 # (Alternatively, directly install Prefect and any needed libraries)
diff --git a/requirements.txt b/run-requirements.txt
similarity index 100%
rename from requirements.txt
rename to run-requirements.txt

From e62a933e094b98c3efa86ab53d978f67cb1e27f0 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 16:45:12 -0400
Subject: [PATCH 54/64] rfg

---
 .github/workflows/prefect_orchestraiton.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index b82ca3a..b0304db 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -27,9 +27,10 @@ jobs:
 
       - name: Build Docker image
         run: |
+          cd prfectWorkflows
           IMAGE_URI="${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/prefect-scraper:latest"
           echo "Building image $IMAGE_URI"
-          docker build -t "$IMAGE_URI" -f prefectWorkflows/Dockerfile .
+          docker build -t "$IMAGE_URI" -f  .
           # Note: The context is the repository root (.), adjust path to Dockerfile if needed.
 
       - name: Push Docker image to Artifact Registry

From 95b2f5ed9fa821ed35669da28cefa267945a1070 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 16:46:08 -0400
Subject: [PATCH 55/64] upgfvfg

---
 .github/workflows/prefect_orchestraiton.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index b0304db..e1c209d 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -27,7 +27,7 @@ jobs:
 
       - name: Build Docker image
         run: |
-          cd prfectWorkflows
+          cd prefectWorkflows
           IMAGE_URI="${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/prefect-scraper:latest"
           echo "Building image $IMAGE_URI"
           docker build -t "$IMAGE_URI" -f  .

From c2a440b4456c8a4bf9eee408464eab11c1e0eaaf Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 16:47:43 -0400
Subject: [PATCH 56/64] Drop dangling -f flag from docker build command

---
 .github/workflows/prefect_orchestraiton.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index e1c209d..b11c43f 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -30,7 +30,7 @@ jobs:
           cd prefectWorkflows
           IMAGE_URI="${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/prefect-scraper:latest"
           echo "Building image $IMAGE_URI"
-          docker build -t "$IMAGE_URI" -f  .
+          docker build -t "$IMAGE_URI"   .
           # Note: The context is the repository root (.), adjust path to Dockerfile if needed.
 
       - name: Push Docker image to Artifact Registry

From 763c444d64f0e70586e1d01a101b34f2276ecec2 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 16:48:46 -0400
Subject: [PATCH 57/64] Copy build context root in Prefect Dockerfile

---
 prefectWorkflows/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prefectWorkflows/Dockerfile b/prefectWorkflows/Dockerfile
index 7af7649..dfc7b42 100644
--- a/prefectWorkflows/Dockerfile
+++ b/prefectWorkflows/Dockerfile
@@ -14,7 +14,7 @@ RUN pip install --no-cache-dir -r requirements.txt
 # RUN pip install prefect==3.1.10
 
 # Copy the Prefect flow code and the dataflow module into the image
-COPY prefectWorkflows/ /app/prefectWorkflows/
+COPY . .
 
 
 # Ensure Python can find the 'dataflow' module (add /app to PYTHONPATH)

From 216ed979bdbb873dcd8661cbc484f3d67771a2cd Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 16:53:44 -0400
Subject: [PATCH 58/64] Remove quotes around IMAGE_URI assignment in Prefect workflow

---
 .github/workflows/prefect_orchestraiton.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index b11c43f..db25d2c 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -28,14 +28,14 @@ jobs:
       - name: Build Docker image
         run: |
           cd prefectWorkflows
-          IMAGE_URI="${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/prefect-scraper:latest"
+          IMAGE_URI=${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/prefect-scraper:latest
           echo "Building image $IMAGE_URI"
           docker build -t "$IMAGE_URI"   .
           # Note: The context is the repository root (.), adjust path to Dockerfile if needed.
 
       - name: Push Docker image to Artifact Registry
         run: |
-          IMAGE_URI="${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/prefect-scraper:latest"
+          IMAGE_URI=${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/prefect-scraper:latest
           docker push "$IMAGE_URI"
         # After this step, the image is available in Artifact Registry for Cloud Run to use.
 

From 83365791bee72ed2763e1c722b70dd3b0351a96f Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 16:58:03 -0400
Subject: [PATCH 59/64] Use google-github-actions/auth@v2 for GCP login in Prefect workflow

---
 .github/workflows/prefect_orchestraiton.yml | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index db25d2c..46505f6 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -12,18 +12,17 @@ jobs:
         uses: actions/checkout@v3
 
       # Authenticate to Google Cloud using the service account JSON key
-      - name: Set up gcloud CLI
-        uses: google-github-actions/setup-gcloud@v1
+      - name: GCP Authentication
+        uses: google-github-actions/auth@v2
         with:
-          service_account_key: ${{ secrets.GCP_KEY }}
-          project_id: ${{ secrets.GCP_PROJECT_ID }}
-          export_default_credentials: true
+          credentials_json: "${{ secrets.GCP_KEY }}"
 
-      - name: Configure Docker auth for Artifact Registry
+      - name: Setup gcloud SDK
+        uses: google-github-actions/setup-gcloud@v2
+
+      - name: Docker login for Artifact Registry
         run: |
-          gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev -q
-        # The above command logs Docker into Artifact Registry&#8203;:contentReference[oaicite:10]{index=10}
-        # using the gcloud credentials (no interactive prompt due to -q).
+          gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev
 
       - name: Build Docker image
         run: |

From 418a062b56ac6d315edb032b1b87504711863867 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 17:10:34 -0400
Subject: [PATCH 60/64] Replace Prefect Cloud login step with prefect deploy

---
 .github/workflows/prefect_orchestraiton.yml | 24 ++++++---------------
 1 file changed, 6 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 46505f6..49b7e06 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -43,26 +43,14 @@ jobs:
         with:
           python-version: "3.10"
 
-      - name: Install Prefect
-        run: pip install prefect==3.1.10
+      - name: Install Prefect3
+        run: pip install --no-cache-dir "prefect>=3.2.4"
 
-      - name: Authenticate with Prefect Cloud
-        env:
-          PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }}
+      - name: Deploy via prefect deploy
         run: |
-          # Use Prefect CLI to log in to Prefect Cloud non-interactively
-          prefect cloud login -k $PREFECT_API_KEY -w "${{ secrets.PREFECT_WORKSPACE }}" || {
-            # Fallback: manually set API URL and API Key if the above doesn't work
-            echo "Using manual Prefect Cloud authentication..."
-            prefect config set PREFECT_API_URL="https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}"
-            prefect config set PREFECT_API_KEY="${{ secrets.PREFECT_API_KEY }}"
-          }
-        # This step authenticates the CLI with Prefect Cloud.
-        # We first attempt `prefect cloud login` (with API key and workspace) for convenience.
-        # If that fails (for example, if workspace name flag is not supported non-interactively),
-        # we fall back to setting PREFECT_API_URL and PREFECT_API_KEY directly&#8203;:contentReference[oaicite:11]{index=11}.
-        # The PREFECT_API_URL uses your Account ID and Workspace ID (from secrets)
-        # to target the correct workspace. This avoids "401 Unauthorized" due to wrong API URL.
+          cd prefectWorkflows
+          prefect deploy -n scraper-cron-deployment \
+            --override image=$IMAGE_URI           # tell pool to use latest image
 
       - name: Deploy Prefect flow
         run: |

From 2a770aad91f70375b81d6862bd4a9077ba12b1d6 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 17:17:50 -0400
Subject: [PATCH 61/64] Remove image override from prefect deploy command

---
 .github/workflows/prefect_orchestraiton.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml
index 49b7e06..36116f7 100644
--- a/.github/workflows/prefect_orchestraiton.yml
+++ b/.github/workflows/prefect_orchestraiton.yml
@@ -49,8 +49,7 @@ jobs:
       - name: Deploy via prefect deploy
         run: |
           cd prefectWorkflows
-          prefect deploy -n scraper-cron-deployment \
-            --override image=$IMAGE_URI           # tell pool to use latest image
+          prefect deploy -n scraper-cron-deployment
 
       - name: Deploy Prefect flow
         run: |

From d94f95c6d027254254f21dae79e15e99b8ba0202 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 17:26:19 -0400
Subject: [PATCH 62/64] Fix flow function name in prefect.yaml entrypoint

---
 prefectWorkflows/prefect.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prefectWorkflows/prefect.yaml b/prefectWorkflows/prefect.yaml
index 70e2399..879a8bd 100644
--- a/prefectWorkflows/prefect.yaml
+++ b/prefectWorkflows/prefect.yaml
@@ -5,7 +5,7 @@ prefect-version: 3.1.10 # Prefect version to use for this deployment (match your
 deployments:
   - name: scraperflow-deployment # Name of this deployment (appears in Prefect UI)
     description: "Scrapes all URLs and segments data every Saturday at 9:00 UTC"
-    entrypoint: scraper_flow.py:scraperflow # Entry point to the flow: "<script path>:<flow function>"
+    entrypoint: scraper_flow.py:scraper_flow # Entry point to the flow: "<script path>:<flow function>"
     # Cron schedule for every Saturday at 9:00 AM
     schedule:
       cron: "0 9 * * 6" # Cron expression for Saturday 09:00 (UTC)&#8203;:contentReference[oaicite:3]{index=3}

From f70154c1c88bb77904c8b0674bd69f3bbc2e6177 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 17:49:46 -0400
Subject: [PATCH 63/64] Update PREFECT_API_KEY in prefectWorkflows/.env

---
 prefectWorkflows/.env | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prefectWorkflows/.env b/prefectWorkflows/.env
index 2e9444b..32d03ba 100644
--- a/prefectWorkflows/.env
+++ b/prefectWorkflows/.env
@@ -7,5 +7,5 @@ BUCKET_NAME=scraped_raw_data_nubot
 RAW_DATA_FOLDER=raw_data
 FAISS_INDEX_FOLDER=faiss_index
 GOOGLE_APPLICATION_CREDENTIALS="E:/gcpkeys/nubot/nubot-nikhil-6adeee091d55.json"
-PREFECT_API_KEY=pnu_VuQWNSlXmc2Hqknf
+PREFECT_API_KEY=pnu_mRGcrBkC9qyFbwGfgrVbjbOoL7WIZ411TKYp
 PREFECT_API_URL="https://api.prefect.cloud/api/accounts/806f2e07-5063-4fbe-9b46-0545ad5de2d1/workspaces/acdf9e9e-8a55-446a-ac46-80a3f843d8b6"

From 98467645f295a173525b114ac839172a68058734 Mon Sep 17 00:00:00 2001
From: Nikhil <nikhilkudupudi@gmail.com>
Date: Sun, 20 Apr 2025 21:02:36 -0400
Subject: [PATCH 64/64] update frontend

---
 .../frontend1-docker-image-build.yml          | 45 ++++++++++++++++
 .gitignore                                    |  3 +-
 services/frontend1/.env                       |  2 +-
 services/frontend1/package.json               | 32 +++++++++++
 services/frontend1/public/index.html          | 21 ++++++++
 services/frontend1/src/ChatInterface.js       | 53 +++++++++----------
 6 files changed, 127 insertions(+), 29 deletions(-)
 create mode 100644 .github/workflows/frontend1-docker-image-build.yml
 create mode 100644 services/frontend1/package.json
 create mode 100644 services/frontend1/public/index.html

diff --git a/.github/workflows/frontend1-docker-image-build.yml b/.github/workflows/frontend1-docker-image-build.yml
new file mode 100644
index 0000000..148f7ab
--- /dev/null
+++ b/.github/workflows/frontend1-docker-image-build.yml
@@ -0,0 +1,45 @@
+name: "build_reactfrontend_image"
+
+on:
+  push:
+    branches:
+      - "**"
+    paths:
+      - "services/frontend1/**"
+jobs:
+  backend_build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: GCP Authentication
+        uses: google-github-actions/auth@v2
+        with:
+          credentials_json: "${{ secrets.GCP_KEY }}"
+
+      - name: Setup gcloud SDK
+        uses: google-github-actions/setup-gcloud@v2
+
+      - name: Docker login for Artifact Registry
+        run: |
+          gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev
+
+      - name: Build and Push Frontend Image
+        run: |
+          cd services/frontend1
+          IMAGE=${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/react-service:latest
+          docker build -t $IMAGE .
+          docker push $IMAGE
+          cd ../..
+
+      - name: Deploy to Cloud Run
+        run: |
+          gcloud run deploy react-service \
+            --source services/frontend1 \
+            --region ${{ secrets.GCP_REGION }} \
+            --platform managed \
+            --allow-unauthenticated \
+            --memory 1Gi \
+            --timeout 3600s \
+            --set-env-vars "REACT_APP_API_URL=${{secrets.REACT_APP_API_URL}}"
diff --git a/.gitignore b/.gitignore
index 71c9eff..166527d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -177,6 +177,7 @@ mlflow-artifacts/
 # PyPI configuration file
 .pypirc
 
-*.html
+
 
 *.json
+!package.json
diff --git a/services/frontend1/.env b/services/frontend1/.env
index afe9281..77dfb3a 100644
--- a/services/frontend1/.env
+++ b/services/frontend1/.env
@@ -1 +1 @@
-REACT_APP_API_URL=http://localhost:8000/api/chat
\ No newline at end of file
+REACT_APP_API_URL=https://backend-service-640053216184.us-east1.run.app/NuBot/
\ No newline at end of file
diff --git a/services/frontend1/package.json b/services/frontend1/package.json
new file mode 100644
index 0000000..6a2a3fc
--- /dev/null
+++ b/services/frontend1/package.json
@@ -0,0 +1,32 @@
+{
+  "name": "nubot-frontend",
+  "version": "1.0.0",
+  "description": "NUBot React Frontend",
+  "main": "index.js",
+  "scripts": {
+    "start": "react-scripts start",
+    "build": "react-scripts build",
+    "test": "react-scripts test",
+    "eject": "react-scripts eject"
+  },
+  "dependencies": {
+    "axios": "^1.6.2",
+    "react": "^18.2.0",
+    "react-dom": "^18.2.0",
+    "react-router-dom": "^6.20.0",
+    "react-scripts": "5.0.1",
+    "uuid": "^9.0.1"
+  },
+  "browserslist": {
+    "production": [
+      ">0.2%",
+      "not dead",
+      "not op_mini all"
+    ],
+    "development": [
+      "last 1 chrome version",
+      "last 1 firefox version",
+      "last 1 safari version"
+    ]
+  }
+}
\ No newline at end of file
diff --git a/services/frontend1/public/index.html b/services/frontend1/public/index.html
new file mode 100644
index 0000000..b21e2b4
--- /dev/null
+++ b/services/frontend1/public/index.html
@@ -0,0 +1,21 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <!-- Replace the default favicon with your SVG icon -->
+    <link rel="icon" href="%PUBLIC_URL%/nu_tab_icon.svg" type="image/svg+xml" />
+    <!-- Keep the .ico as fallback for browsers that don't support SVG favicons -->
+    <link rel="alternate icon" href="%PUBLIC_URL%/favicon.ico" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <meta name="theme-color" content="#000000" />
+    <meta
+      name="description"
+      content="NUBot - Your assistant for all things Northeastern!"
+    />
+    <title>NUBot</title>
+  </head>
+  <body>
+    <noscript>You need to enable JavaScript to run this app.</noscript>
+    <div id="root"></div>
+  </body>
+</html>
diff --git a/services/frontend1/src/ChatInterface.js b/services/frontend1/src/ChatInterface.js
index e48f831..e696e38 100644
--- a/services/frontend1/src/ChatInterface.js
+++ b/services/frontend1/src/ChatInterface.js
@@ -4,20 +4,8 @@ import NULogo from './NULogo';
 import BotAvatar from './BotAvatar';
 import UserAvatar from './UserAvatar';
 
-// Sample chat responses for the full chat interface
-const CHAT_RESPONSES = {
-  "hello": "Hi there! I'm NUBot. How can I help you with Northeastern University information today?",
-  "hi": "Hello! I'm NUBot, your Northeastern University assistant. What information can I help you find?",
-  "courses": "Northeastern offers a wide range of courses across various disciplines. Are you looking for courses in a specific department or program?",
-  "faculty": "Northeastern has renowned faculty members across all colleges. Which department or professor are you interested in learning about?",
-  "campus": "Northeastern's main campus is located in Boston, MA. We also have regional campuses in Charlotte, Seattle, San Francisco, Vancouver, Portland ME, and more. Which campus would you like to know more about?",
-  "about": "I'm NUBot, an AI assistant designed to help you navigate Northeastern University information more easily. I can answer questions about courses, faculty, campus resources, and more!",
-  "help": "I can help you find information about Northeastern's academic programs, faculty, campus resources, student services, and more. What would you like to know?",
-  "admission": "Northeastern has different application processes for undergraduate, graduate, and professional programs. Would you like information about a specific program's admission requirements?",
-  "events": "Northeastern hosts various events and activities throughout the year. You can check the university calendar or specific department pages for upcoming events. Is there a particular type of event you're interested in?",
-  "registration": "Course registration typically opens several months before the start of each semester. The exact dates depend on your student status and program. Would you like to know more about the registration process?",
-  "housing": "Northeastern offers various on-campus housing options for students, from traditional residence halls to apartment-style accommodations. Off-campus housing resources are also available through the university. Would you like specific information about housing options?",
-};
+// Load API URL from environment variable
+const API_URL = process.env.REACT_APP_API_URL || 'http://localhost:8080/NuBot';
 
 // Typing indicator component that uses the three dots
 const TypingIndicator = () => {
@@ -47,7 +35,7 @@ const ChatInterface = () => {
     }
   }, [messages, isLoading]); // Also scroll when loading state changes
   
-  const handleSendMessage = () => {
+  const handleSendMessage = async () => {
     if (!userInput.trim() || isLoading) return;
     
     // Add user message to chat
@@ -58,22 +46,33 @@ const ChatInterface = () => {
     // Set loading state
     setIsLoading(true);
     
-    // Process bot response with delay to simulate thinking/processing
-    setTimeout(() => {
-      const lowerCaseInput = userMessage.toLowerCase();
-      let botResponse = "I'm here to help with anything related to Northeastern University. What else would you like to know?";
+    try {
+      // Make API call to get response
+      const response = await fetch(`${API_URL}`, {
+        method: 'POST',
+        headers: {
+          'Content-Type': 'application/json',
+        },
+        body: JSON.stringify({ query: userMessage }),
+      });
       
-      // Check for keyword matches
-      for (const [keyword, response] of Object.entries(CHAT_RESPONSES)) {
-        if (lowerCaseInput.includes(keyword)) {
-          botResponse = response;
-          break;
-        }
+      if (!response.ok) {
+        throw new Error(`API request failed with status: ${response.status}`);
       }
       
-      setMessages(prev => [...prev, { sender: 'bot', message: botResponse }]);
+      const data = await response.json();
+      // The Flask backend returns the response directly, not wrapped in a field
+      setMessages(prev => [...prev, { sender: 'bot', message: data }]);
+    } catch (error) {
+      console.error('Error fetching response:', error);
+      // Add error message to chat
+      setMessages(prev => [...prev, {
+        sender: 'bot',
+        message: "I'm sorry, I'm having trouble connecting to my knowledge base. Please try again later."
+      }]);
+    } finally {
       setIsLoading(false);
-    }, 1500); // Delay to simulate processing - adjust as needed
+    }
   };
   
   const handleKeyPress = (e) => {