From 9aee81a30a78b94df1c58c68f60f65a2932d2e2a Mon Sep 17 00:00:00 2001 From: Nikhil Date: Mon, 14 Apr 2025 12:47:20 -0400 Subject: [PATCH 01/64] docker deployment --- .../workflows/backend-docker-image-build.yml | 20 +++++++++++++++++++ airflow/dags/dataflow/chunk_data.py | 3 ++- airflow/dags/dataflow/scraper.py | 3 ++- airflow/dags/dataflow/store_data.py | 3 ++- prefectWorkflows/dataflow/chunk_data.py | 3 ++- prefectWorkflows/dataflow/scraper.py | 3 ++- prefectWorkflows/dataflow/store_data.py | 3 ++- services/backend/Dockerfile | 3 ++- services/backend/main.py | 6 +++--- services/backend/src/dataflow/chunk_data.py | 15 +++++++++++++- services/backend/src/dataflow/scraper.py | 16 ++++++++++++++- services/backend/src/dataflow/store_data.py | 14 ++++++++++++- 12 files changed, 79 insertions(+), 13 deletions(-) diff --git a/.github/workflows/backend-docker-image-build.yml b/.github/workflows/backend-docker-image-build.yml index f26e491..5f5897e 100644 --- a/.github/workflows/backend-docker-image-build.yml +++ b/.github/workflows/backend-docker-image-build.yml @@ -31,3 +31,23 @@ jobs: docker build -t $IMAGE . docker push $IMAGE cd ../.. + - name: gcp deploy + run: | + - name: gcp deploy + run: | + gcloud run deploy backend-service \ + --source services/backend \ + --region ${{ secrets.GCP_REGION }} \ + --platform managed \ + --allow-unauthenticated \ + --set-env-vars \ + AIRFLOW_UID=5000,\ + BASE_URL='https://www.khoury.northeastern.edu/',\ + MAX_DEPTH=3,\ + CONCURRENT_REQUESTS=10,\ + DATA_FOLDER="scraped_data",\ + MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},\ + MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},\ + BUCKET_NAME=${{ secrets.BUCKET_NAME }},\ + RAW_DATA_FOLDER=raw_data,\ + FAISS_INDEX_FOLDER=faiss_index diff --git a/airflow/dags/dataflow/chunk_data.py b/airflow/dags/dataflow/chunk_data.py index fc730d4..30f3ab4 100644 --- a/airflow/dags/dataflow/chunk_data.py +++ b/airflow/dags/dataflow/chunk_data.py @@ -11,7 +11,8 @@ from dataflow.store_data import upload_faiss_index_to_bucket load_dotenv(override=True) BUCKET_NAME= os.getenv('BUCKET_NAME') -GOOGLE_APPLICATION_CREDENTIALS=os.getenv('GOOGLE_APPLICATION_CREDENTIALS') +from google.auth import default +credentials, project = default() RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER') def chunk_data(): # Load all JSON files from a directory diff --git a/airflow/dags/dataflow/scraper.py b/airflow/dags/dataflow/scraper.py index 9a0e5f9..d426ece 100644 --- a/airflow/dags/dataflow/scraper.py +++ b/airflow/dags/dataflow/scraper.py @@ -13,7 +13,8 @@ BASE_URL = os.getenv('BASE_URL') MAX_DEPTH = int(os.getenv('MAX_DEPTH')) # Maximum recursion depth (base URL is depth 0) CONCURRENT_REQUESTS = int(os.getenv('CONCURRENT_REQUESTS')) # Maximum number of concurrent requests -GOOGLE_APPLICATION_CREDENTIALS =os.getenv('GOOGLE_APPLICATION_CREDENTIALS ') +from google.auth import default +credentials, project = default() # Create folder for JSON data DATA_FOLDER = "scraped_data" if not os.path.exists(DATA_FOLDER): diff --git a/airflow/dags/dataflow/store_data.py b/airflow/dags/dataflow/store_data.py index d142be3..95ffa68 100644 --- a/airflow/dags/dataflow/store_data.py +++ b/airflow/dags/dataflow/store_data.py @@ -5,7 +5,8 @@ BUCKET_NAME= os.getenv('BUCKET_NAME') RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER') FAISS_INDEX_FOLDER= os.getenv('FAISS_INDEX_FOLDER') -GOOGLE_APPLICATION_CREDENTIALS=os.getenv('GOOGLE_APPLICATION_CREDENTIALS') +from google.auth import default +credentials, project = default() def get_blob_from_bucket(): storage_client = Client() diff --git a/prefectWorkflows/dataflow/chunk_data.py b/prefectWorkflows/dataflow/chunk_data.py index 643dcba..aa83b8a 100644 --- a/prefectWorkflows/dataflow/chunk_data.py +++ b/prefectWorkflows/dataflow/chunk_data.py @@ -11,7 +11,8 @@ from dataflow.store_data import upload_faiss_index_to_bucket load_dotenv(override=True) BUCKET_NAME= os.getenv('BUCKET_NAME') -GOOGLE_APPLICATION_CREDENTIALS=os.getenv('GOOGLE_APPLICATION_CREDENTIALS') +from google.auth import default +credentials, project = default() RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER') def chunk_data(): # Load all JSON files from a directory diff --git a/prefectWorkflows/dataflow/scraper.py b/prefectWorkflows/dataflow/scraper.py index 9a0e5f9..d426ece 100644 --- a/prefectWorkflows/dataflow/scraper.py +++ b/prefectWorkflows/dataflow/scraper.py @@ -13,7 +13,8 @@ BASE_URL = os.getenv('BASE_URL') MAX_DEPTH = int(os.getenv('MAX_DEPTH')) # Maximum recursion depth (base URL is depth 0) CONCURRENT_REQUESTS = int(os.getenv('CONCURRENT_REQUESTS')) # Maximum number of concurrent requests -GOOGLE_APPLICATION_CREDENTIALS =os.getenv('GOOGLE_APPLICATION_CREDENTIALS ') +from google.auth import default +credentials, project = default() # Create folder for JSON data DATA_FOLDER = "scraped_data" if not os.path.exists(DATA_FOLDER): diff --git a/prefectWorkflows/dataflow/store_data.py b/prefectWorkflows/dataflow/store_data.py index 039dc22..de6cc16 100644 --- a/prefectWorkflows/dataflow/store_data.py +++ b/prefectWorkflows/dataflow/store_data.py @@ -5,7 +5,8 @@ BUCKET_NAME= os.getenv('BUCKET_NAME') RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER') FAISS_INDEX_FOLDER= os.getenv('FAISS_INDEX_FOLDER') -GOOGLE_APPLICATION_CREDENTIALS=os.getenv('GOOGLE_APPLICATION_CREDENTIALS') +from google.auth import default +credentials, project = default() def get_blob_from_bucket(): storage_client = Client() diff --git a/services/backend/Dockerfile b/services/backend/Dockerfile index b6a91f8..a91ae59 100644 --- a/services/backend/Dockerfile +++ b/services/backend/Dockerfile @@ -16,7 +16,8 @@ WORKDIR /app # Pre-copy requirements separately for Docker cache efficiency COPY runtime-requirements.txt . - +RUN pip install --upgrade pip +RUN pip install "huggingface_hub[hf_xet]" # Install Python dependencies RUN pip install --no-cache-dir -r runtime-requirements.txt diff --git a/services/backend/main.py b/services/backend/main.py index 54f6212..056dc8c 100644 --- a/services/backend/main.py +++ b/services/backend/main.py @@ -49,6 +49,6 @@ def post(self): if __name__=="__main__": - PORT=os.getenv('PORT') - HOST=os.getenv('HOST') - app.run(host=HOST,port=PORT,debug=True) \ No newline at end of file + PORT=os.getenv('PORT', 8080) + + app.run(host='0.0.0.0',port=PORT,debug=True) \ No newline at end of file diff --git a/services/backend/src/dataflow/chunk_data.py b/services/backend/src/dataflow/chunk_data.py index 7a43414..265ce1a 100644 --- a/services/backend/src/dataflow/chunk_data.py +++ b/services/backend/src/dataflow/chunk_data.py @@ -11,7 +11,20 @@ from store_data import upload_faiss_index_to_bucket load_dotenv(override=True) BUCKET_NAME= os.getenv('BUCKET_NAME') -GOOGLE_APPLICATION_CREDENTIALS=os.getenv('GOOGLE_APPLICATION_CREDENTIALS') +from google.auth import default +from google.oauth2 import service_account + +# Try to get credentials - works in both Docker and Cloud Run +try: + # First try Application Default Credentials (works in Cloud Run) + credentials, project = default() +except Exception: + # Fall back to explicit credentials file (for Docker) + credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") + if credentials_path: + credentials = service_account.Credentials.from_service_account_file(credentials_path) + else: + raise Exception("No credentials available") RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER') def chunk_data(): # Load all JSON files from a directory diff --git a/services/backend/src/dataflow/scraper.py b/services/backend/src/dataflow/scraper.py index 69cf26b..f5a99c4 100644 --- a/services/backend/src/dataflow/scraper.py +++ b/services/backend/src/dataflow/scraper.py @@ -13,7 +13,21 @@ BASE_URL = os.getenv('BASE_URL') MAX_DEPTH = int(os.getenv('MAX_DEPTH')) # Maximum recursion depth (base URL is depth 0) CONCURRENT_REQUESTS = int(os.getenv('CONCURRENT_REQUESTS')) # Maximum number of concurrent requests -GOOGLE_APPLICATION_CREDENTIALS =os.getenv('GOOGLE_APPLICATION_CREDENTIALS ') + +from google.auth import default +from google.oauth2 import service_account + +# Try to get credentials - works in both Docker and Cloud Run +try: + # First try Application Default Credentials (works in Cloud Run) + credentials, project = default() +except Exception: + # Fall back to explicit credentials file (for Docker) + credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") + if credentials_path: + credentials = service_account.Credentials.from_service_account_file(credentials_path) + else: + raise Exception("No credentials available") # Create folder for JSON data DATA_FOLDER = "scraped_data" if not os.path.exists(DATA_FOLDER): diff --git a/services/backend/src/dataflow/store_data.py b/services/backend/src/dataflow/store_data.py index d142be3..837e630 100644 --- a/services/backend/src/dataflow/store_data.py +++ b/services/backend/src/dataflow/store_data.py @@ -5,8 +5,20 @@ BUCKET_NAME= os.getenv('BUCKET_NAME') RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER') FAISS_INDEX_FOLDER= os.getenv('FAISS_INDEX_FOLDER') -GOOGLE_APPLICATION_CREDENTIALS=os.getenv('GOOGLE_APPLICATION_CREDENTIALS') +from google.auth import default +from google.oauth2 import service_account +# Try to get credentials - works in both Docker and Cloud Run +try: + # First try Application Default Credentials (works in Cloud Run) + credentials, project = default() +except Exception: + # Fall back to explicit credentials file (for Docker) + credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") + if credentials_path: + credentials = service_account.Credentials.from_service_account_file(credentials_path) + else: + raise Exception("No credentials available") def get_blob_from_bucket(): storage_client = Client() bucket = storage_client.bucket(BUCKET_NAME) From e5571ae5138644cb92a6b6f9779e1da2c3ed3ca1 Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Mon, 14 Apr 2025 15:47:43 -0400 Subject: [PATCH 02/64] update workflow alignment --- .../workflows/backend-docker-image-build.yml | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/.github/workflows/backend-docker-image-build.yml b/.github/workflows/backend-docker-image-build.yml index 5f5897e..62b06ac 100644 --- a/.github/workflows/backend-docker-image-build.yml +++ b/.github/workflows/backend-docker-image-build.yml @@ -31,23 +31,22 @@ jobs: docker build -t $IMAGE . docker push $IMAGE cd ../.. - - name: gcp deploy + + - name: Deploy to Cloud Run run: | - - name: gcp deploy - run: | - gcloud run deploy backend-service \ - --source services/backend \ - --region ${{ secrets.GCP_REGION }} \ - --platform managed \ - --allow-unauthenticated \ - --set-env-vars \ - AIRFLOW_UID=5000,\ - BASE_URL='https://www.khoury.northeastern.edu/',\ - MAX_DEPTH=3,\ - CONCURRENT_REQUESTS=10,\ - DATA_FOLDER="scraped_data",\ - MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},\ - MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},\ - BUCKET_NAME=${{ secrets.BUCKET_NAME }},\ - RAW_DATA_FOLDER=raw_data,\ - FAISS_INDEX_FOLDER=faiss_index + gcloud run deploy backend-service \ + --source services/backend \ + --region ${{ secrets.GCP_REGION }} \ + --platform managed \ + --allow-unauthenticated \ + --set-env-vars \ + AIRFLOW_UID=5000,\ + BASE_URL='https://www.khoury.northeastern.edu/',\ + MAX_DEPTH=3,\ + CONCURRENT_REQUESTS=10,\ + DATA_FOLDER="scraped_data",\ + MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},\ + MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},\ + BUCKET_NAME=${{ secrets.BUCKET_NAME }},\ + RAW_DATA_FOLDER=raw_data,\ + FAISS_INDEX_FOLDER=faiss_index From 0a676d49066caf6da674409c9639eafbfe1ec94b Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Mon, 14 Apr 2025 15:59:04 -0400 Subject: [PATCH 03/64] update envs to single ,s --- .../workflows/backend-docker-image-build.yml | 27 +++++++------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/.github/workflows/backend-docker-image-build.yml b/.github/workflows/backend-docker-image-build.yml index 62b06ac..995dc93 100644 --- a/.github/workflows/backend-docker-image-build.yml +++ b/.github/workflows/backend-docker-image-build.yml @@ -32,21 +32,12 @@ jobs: docker push $IMAGE cd ../.. - - name: Deploy to Cloud Run - run: | - gcloud run deploy backend-service \ - --source services/backend \ - --region ${{ secrets.GCP_REGION }} \ - --platform managed \ - --allow-unauthenticated \ - --set-env-vars \ - AIRFLOW_UID=5000,\ - BASE_URL='https://www.khoury.northeastern.edu/',\ - MAX_DEPTH=3,\ - CONCURRENT_REQUESTS=10,\ - DATA_FOLDER="scraped_data",\ - MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},\ - MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},\ - BUCKET_NAME=${{ secrets.BUCKET_NAME }},\ - RAW_DATA_FOLDER=raw_data,\ - FAISS_INDEX_FOLDER=faiss_index + - name: Deploy to Cloud Run + run: | + gcloud run deploy backend-service \ + --source services/backend \ + --region ${{ secrets.GCP_REGION }} \ + --platform managed \ + --allow-unauthenticated \ + --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index" + From c3fc67d8822631d39bec06e9573b084717088d66 Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Mon, 14 Apr 2025 16:00:56 -0400 Subject: [PATCH 04/64] divide the steps --- .github/workflows/backend-docker-image-build.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/backend-docker-image-build.yml b/.github/workflows/backend-docker-image-build.yml index 995dc93..91210d5 100644 --- a/.github/workflows/backend-docker-image-build.yml +++ b/.github/workflows/backend-docker-image-build.yml @@ -32,9 +32,9 @@ jobs: docker push $IMAGE cd ../.. - - name: Deploy to Cloud Run - run: | - gcloud run deploy backend-service \ + - name: Deploy to Cloud Run + run: | + gcloud run deploy backend-service \ --source services/backend \ --region ${{ secrets.GCP_REGION }} \ --platform managed \ From 45a2ded715b240a571c302693c1228bb366b7b75 Mon Sep 17 00:00:00 2001 From: Nikhil Date: Tue, 15 Apr 2025 15:14:35 -0400 Subject: [PATCH 05/64] update workflows --- .github/workflows/backend-docker-image-build.yml | 13 +++++++------ .github/workflows/frontend-docker-image-build.yml | 11 +++++++++++ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/.github/workflows/backend-docker-image-build.yml b/.github/workflows/backend-docker-image-build.yml index 91210d5..edbc057 100644 --- a/.github/workflows/backend-docker-image-build.yml +++ b/.github/workflows/backend-docker-image-build.yml @@ -35,9 +35,10 @@ jobs: - name: Deploy to Cloud Run run: | gcloud run deploy backend-service \ - --source services/backend \ - --region ${{ secrets.GCP_REGION }} \ - --platform managed \ - --allow-unauthenticated \ - --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index" - + --source services/backend \ + --region ${{ secrets.GCP_REGION }} \ + --platform managed \ + --allow-unauthenticated \ + --memory 4Gi \ + --timeout 3600s \ + --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index" diff --git a/.github/workflows/frontend-docker-image-build.yml b/.github/workflows/frontend-docker-image-build.yml index 537cd3c..b547d18 100644 --- a/.github/workflows/frontend-docker-image-build.yml +++ b/.github/workflows/frontend-docker-image-build.yml @@ -31,3 +31,14 @@ jobs: docker build -t $IMAGE . docker push $IMAGE cd ../.. + + - name: Deploy to Cloud Run + run: | + gcloud run deploy frontend-service \ + --source services/frontend \ + --region ${{ secrets.GCP_REGION }} \ + --platform managed \ + --allow-unauthenticated \ + --memory 1Gi \ + --timeout 1800s \ + --set-env-vars "API_URL=https://backend-service-273412-default.run.app/NuBot" From d86bb0498705232385c312ed7f8708957249c18f Mon Sep 17 00:00:00 2001 From: Nikhil Date: Tue, 15 Apr 2025 15:17:31 -0400 Subject: [PATCH 06/64] update alignment --- .github/workflows/backend-docker-image-build.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/backend-docker-image-build.yml b/.github/workflows/backend-docker-image-build.yml index edbc057..5ece471 100644 --- a/.github/workflows/backend-docker-image-build.yml +++ b/.github/workflows/backend-docker-image-build.yml @@ -35,10 +35,10 @@ jobs: - name: Deploy to Cloud Run run: | gcloud run deploy backend-service \ - --source services/backend \ - --region ${{ secrets.GCP_REGION }} \ - --platform managed \ - --allow-unauthenticated \ - --memory 4Gi \ - --timeout 3600s \ - --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index" + --source services/backend \ + --region ${{ secrets.GCP_REGION }} \ + --platform managed \ + --allow-unauthenticated \ + --memory 4Gi \ + --timeout 3600s \ + --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index" From 909e2dcc4c00d12484c414a3348b8501b5cc079b Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Tue, 15 Apr 2025 18:36:30 -0400 Subject: [PATCH 07/64] add docker ignore to front end --- services/frontend/.dockerignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 services/frontend/.dockerignore diff --git a/services/frontend/.dockerignore b/services/frontend/.dockerignore new file mode 100644 index 0000000..4f509e5 --- /dev/null +++ b/services/frontend/.dockerignore @@ -0,0 +1 @@ +*.env \ No newline at end of file From 99d0ea8accfed6cda160311b150689182e418c2a Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Tue, 15 Apr 2025 19:06:55 -0400 Subject: [PATCH 08/64] update condition for workflows --- .github/workflows/backend-docker-image-build.yml | 3 ++- .github/workflows/frontend-docker-image-build.yml | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/backend-docker-image-build.yml b/.github/workflows/backend-docker-image-build.yml index 5ece471..4cb163c 100644 --- a/.github/workflows/backend-docker-image-build.yml +++ b/.github/workflows/backend-docker-image-build.yml @@ -4,7 +4,8 @@ on: push: branches: - "**" - + paths: + - 'services/backend/**' jobs: backend_build: runs-on: ubuntu-latest diff --git a/.github/workflows/frontend-docker-image-build.yml b/.github/workflows/frontend-docker-image-build.yml index b547d18..abd2475 100644 --- a/.github/workflows/frontend-docker-image-build.yml +++ b/.github/workflows/frontend-docker-image-build.yml @@ -4,6 +4,8 @@ on: push: branches: - "**" + paths: + - 'services/frontend/**' jobs: frontend_build: @@ -41,4 +43,4 @@ jobs: --allow-unauthenticated \ --memory 1Gi \ --timeout 1800s \ - --set-env-vars "API_URL=https://backend-service-273412-default.run.app/NuBot" + --set-env-vars "API_URL=https://backend-service-273412-default.run.app/NuBot/" From f32fef7cd231b584f84aac113501c61000824044 Mon Sep 17 00:00:00 2001 From: Nikhil Date: Tue, 15 Apr 2025 23:14:35 -0400 Subject: [PATCH 09/64] update mlflow --- services/backend/.dockerignore | 6 +- services/backend/src/dataflow/rag_model.py | 37 +++++------ services/frontend/.env | 2 +- services/frontend/app.py | 76 ++++++++++++++-------- 4 files changed, 71 insertions(+), 50 deletions(-) diff --git a/services/backend/.dockerignore b/services/backend/.dockerignore index 58329ad..c618940 100644 --- a/services/backend/.dockerignore +++ b/services/backend/.dockerignore @@ -1,3 +1,7 @@ *.env __pycache__/ -scraped_data/ \ No newline at end of file +scraped_data/ +mlruns/ +mlflow.db/ +mlartifacts/ +logs/ \ No newline at end of file diff --git a/services/backend/src/dataflow/rag_model.py b/services/backend/src/dataflow/rag_model.py index fe05420..5d25ac6 100644 --- a/services/backend/src/dataflow/rag_model.py +++ b/services/backend/src/dataflow/rag_model.py @@ -27,38 +27,27 @@ # Where you currently have this line: mlflow.set_experiment("rag_experiment") def get_or_create_experiment(experiment_name): - - # Check if experiment exists try: experiment = mlflow.get_experiment_by_name(experiment_name) - if experiment is not None: - # Check if experiment is active (not deleted) if experiment.lifecycle_stage == "active": - print(f"Found active experiment '{experiment_name}' with ID: {experiment.experiment_id}") + print(f"✅ Found experiment: {experiment.experiment_id}") return experiment.experiment_id else: - # Experiment exists but is deleted, create a new one with timestamp - new_name = f"{experiment_name}_{int(time.time())}" - experiment_id = mlflow.create_experiment(new_name) - print(f"Original experiment was deleted. Created new experiment '{new_name}' with ID: {experiment_id}") - return experiment_id - else: - # Create new experiment - experiment_id = mlflow.create_experiment(experiment_name) - print(f"Created new experiment '{experiment_name}' with ID: {experiment_id}") - return experiment_id - except Exception as e: - print(f"Error getting or creating experiment: {e}") - # Fallback - create a new experiment with timestamp - new_name = f"{experiment_name}_{int(time.time())}" - experiment_id = mlflow.create_experiment(new_name) - print(f"Created fallback experiment '{new_name}' with ID: {experiment_id}") + print(f"⚠️ Experiment exists but is deleted. Recreating...") + # Create a new experiment (either not found or was deleted) + experiment_id = mlflow.create_experiment(experiment_name) + print(f"🆕 Created experiment '{experiment_name}' with ID: {experiment_id}") return experiment_id + except Exception as e: + print(f"🚨 Exception during experiment creation: {e}") + return None # Replace it with: experiment_id = get_or_create_experiment("rag_experiment") -mlflow.set_experiment_tag("description", "RAG pipeline with Mistral AI model") +if not experiment_id: + raise ValueError("❌ Could not get or create a valid experiment ID. Aborting.") +mlflow.set_tag("description", "RAG pipeline with Mistral AI model") if not os.environ.get("MISTRAL_API_KEY"): os.environ["MISTRAL_API_KEY"] = getpass.getpass("Enter API key for Mistral AI: ") @@ -186,6 +175,10 @@ async def checkModel_fairness(): query=input("generate query") response=generateResponse(query) + print("MLflow URI:", mlflow.get_tracking_uri()) + print("Using experiment ID:", experiment_id) + print("Experiments available:", mlflow.search_experiments()) + print(response) #uncomment and enter prompts for model fairness and there is a limitation on api key # asyncio.run(checkModel_fairness()) \ No newline at end of file diff --git a/services/frontend/.env b/services/frontend/.env index 6148832..7a2998f 100644 --- a/services/frontend/.env +++ b/services/frontend/.env @@ -1 +1 @@ -API_URL="http://localhost:5002" \ No newline at end of file +API_URL="https://backend-service-640053216184.us-east1.run.app/NuBot/" \ No newline at end of file diff --git a/services/frontend/app.py b/services/frontend/app.py index 03a793d..52e392e 100644 --- a/services/frontend/app.py +++ b/services/frontend/app.py @@ -1,30 +1,54 @@ import streamlit as st import requests -from dotenv import load_dotenv import os +from dotenv import load_dotenv + load_dotenv(override=True) -# Define the backend API URL -API_URL = os.getenv('API_URL') # Update if running on a different host - -st.title("NuBot Chat Interface") -st.markdown("### Ask NuBot any question!") - -# User input -query = st.text_input("Enter your query:") - -if st.button("Submit"): - if query: - try: - # Send request to the backend API - response = requests.post(API_URL, json={"query": query}) - - # Display the response - if response.status_code == 200: - st.success("Response from NuBot:") - st.write(response.json()) - else: - st.error(f"Error {response.status_code}: {response.text}") - except requests.exceptions.RequestException as e: - st.error(f"Request failed: {e}") - else: - st.warning("Please enter a query before submitting.") + +# Load API URL from environment variable +API_URL = os.getenv("API_URL", "http://localhost:8000/api/chat") + +# Streamlit app config +st.set_page_config(page_title="NU Bot", page_icon="🤖", layout="centered") +st.title("🤖 NU Bot") +st.markdown("### Smart Chatbot for Northeastern University") + +# Input field for user questions +user_input = st.text_input("Ask NU Bot a question:", "") + +# Display response +if user_input: + try: + response = requests.post(API_URL, json={"query": user_input},verify=False) + # print(response.json()) + if response.status_code == 200: + answer = response.json().get("answer", "No answer provided.") + else: + answer = f"Error: {response.status_code} - {response.text}" + except Exception as e: + answer = f"An error occurred: {e}" + + st.markdown(f"**NU Bot says:** {answer}") + +# Technologies +with st.expander("🔧 Technologies Used"): + st.markdown(""" + - Google Cloud Platform (GCP) + - Mistral AI + - Python + - GitHub + """) + +# Features +with st.expander("💡 Features"): + st.markdown(""" + - Interactive chatbot interface + - Smart Q&A using university web data + - Real-time responses + - Scalable architecture + """) + +# Contact +with st.expander("📬 Contact Us"): + st.markdown("Email us at [nubot@northeastern.edu](mailto:nubot@northeastern.edu)") + From 8c9aa36b4488e05538374e3f38a5800ac728dec8 Mon Sep 17 00:00:00 2001 From: Nikhil Date: Tue, 15 Apr 2025 23:33:38 -0400 Subject: [PATCH 10/64] revert frontend --- services/frontend/app.py | 76 ++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 50 deletions(-) diff --git a/services/frontend/app.py b/services/frontend/app.py index 52e392e..03a793d 100644 --- a/services/frontend/app.py +++ b/services/frontend/app.py @@ -1,54 +1,30 @@ import streamlit as st import requests -import os from dotenv import load_dotenv - +import os load_dotenv(override=True) - -# Load API URL from environment variable -API_URL = os.getenv("API_URL", "http://localhost:8000/api/chat") - -# Streamlit app config -st.set_page_config(page_title="NU Bot", page_icon="🤖", layout="centered") -st.title("🤖 NU Bot") -st.markdown("### Smart Chatbot for Northeastern University") - -# Input field for user questions -user_input = st.text_input("Ask NU Bot a question:", "") - -# Display response -if user_input: - try: - response = requests.post(API_URL, json={"query": user_input},verify=False) - # print(response.json()) - if response.status_code == 200: - answer = response.json().get("answer", "No answer provided.") - else: - answer = f"Error: {response.status_code} - {response.text}" - except Exception as e: - answer = f"An error occurred: {e}" - - st.markdown(f"**NU Bot says:** {answer}") - -# Technologies -with st.expander("🔧 Technologies Used"): - st.markdown(""" - - Google Cloud Platform (GCP) - - Mistral AI - - Python - - GitHub - """) - -# Features -with st.expander("💡 Features"): - st.markdown(""" - - Interactive chatbot interface - - Smart Q&A using university web data - - Real-time responses - - Scalable architecture - """) - -# Contact -with st.expander("📬 Contact Us"): - st.markdown("Email us at [nubot@northeastern.edu](mailto:nubot@northeastern.edu)") - +# Define the backend API URL +API_URL = os.getenv('API_URL') # Update if running on a different host + +st.title("NuBot Chat Interface") +st.markdown("### Ask NuBot any question!") + +# User input +query = st.text_input("Enter your query:") + +if st.button("Submit"): + if query: + try: + # Send request to the backend API + response = requests.post(API_URL, json={"query": query}) + + # Display the response + if response.status_code == 200: + st.success("Response from NuBot:") + st.write(response.json()) + else: + st.error(f"Error {response.status_code}: {response.text}") + except requests.exceptions.RequestException as e: + st.error(f"Request failed: {e}") + else: + st.warning("Please enter a query before submitting.") From 0baa07da6c9756c48b7363183054476a592ae22b Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Wed, 16 Apr 2025 12:41:44 -0400 Subject: [PATCH 11/64] check mlflow issue --- services/backend/src/dataflow/rag_model.py | 38 +++++++++++++++------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/services/backend/src/dataflow/rag_model.py b/services/backend/src/dataflow/rag_model.py index 5d25ac6..7b20b6b 100644 --- a/services/backend/src/dataflow/rag_model.py +++ b/services/backend/src/dataflow/rag_model.py @@ -25,7 +25,7 @@ FAISS_INDEX_FOLDER= os.getenv('FAISS_INDEX_FOLDER') mlflow.set_tracking_uri(MLFLOW_TRACKING_URI) # Remote MLflow Server # Where you currently have this line: -mlflow.set_experiment("rag_experiment") + def get_or_create_experiment(experiment_name): try: experiment = mlflow.get_experiment_by_name(experiment_name) @@ -45,8 +45,15 @@ def get_or_create_experiment(experiment_name): # Replace it with: experiment_id = get_or_create_experiment("rag_experiment") -if not experiment_id: - raise ValueError("❌ Could not get or create a valid experiment ID. Aborting.") +def ensure_experiment(name): + try: + mlflow.set_experiment(name) + except RestException: + mlflow.create_experiment(name) + mlflow.set_experiment(name) + +ensure_experiment("rag_experiment") +# mlflow.set_experiment("rag_experiment") mlflow.set_tag("description", "RAG pipeline with Mistral AI model") if not os.environ.get("MISTRAL_API_KEY"): os.environ["MISTRAL_API_KEY"] = getpass.getpass("Enter API key for Mistral AI: ") @@ -60,13 +67,20 @@ def get_llm(): def get_prompt(): # Define prompt for question-answering # Your prompt template - template = """Use the following pieces of context to answer the question at the end. -If you don't know the answer, just say that you don't know, don't try to make up an answer. -Use three sentences maximum and keep the answer as concise as possible. -Always say "thanks for asking!" at the end of the answer. + template = """You are an expert assistant helping to answer questions based only on the given context. + +Instructions: +- Use ONLY the context below to answer. +- If the context does not contain the answer, say: "I don't know based on the available information." +- Answer in 2-3 sentences, clearly and factually. +- End your response with: "Thanks for asking!" + +Context: {context} + Question: {question} -Helpful Answer:""" + +Answer:""" custom_rag_prompt = PromptTemplate.from_template(template) return custom_rag_prompt @@ -102,9 +116,9 @@ def load_embeddings(): vector_store = FAISS.load_local(FAISS_INDEX_FOLDER, embeddings, allow_dangerous_deserialization=True) # Define application steps def retrieve(state: State): - with mlflow.start_run(nested=True, run_name="retrieval",experiment_id=experiment_id): + with mlflow.start_run(nested=True, run_name="retrieval"): start_time = time.time() - retrieved_docs = vector_store.similarity_search(state["question"]) + retrieved_docs = vector_store.similarity_search(state["question"],k=10) retrieval_time = time.time() - start_time # Extract only metadata @@ -123,7 +137,7 @@ def retrieve(state: State): # Initialize prompt once and store in a global variable prompt = get_prompt() def generate(state: State): - with mlflow.start_run(nested=True, run_name="generation",experiment_id=experiment_id): + with mlflow.start_run(nested=True, run_name="generation"): start_time = time.time() docs_content = "\n\n".join(doc.page_content for doc in state["context"]) token_count = len(docs_content.split()) @@ -150,7 +164,7 @@ def generate(state: State): def generateResponse(query): # Compile application and test try: - with mlflow.start_run(run_name="RAG_Pipeline",experiment_id=experiment_id): + with mlflow.start_run(run_name="RAG_Pipeline"): mlflow.log_param("query", query) graph_builder = StateGraph(State).add_sequence([retrieve, generate]) graph_builder.add_edge(START, "retrieve") From 26f42966f820d9628be1e43320523cc7109503b8 Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Wed, 16 Apr 2025 12:47:53 -0400 Subject: [PATCH 12/64] update file safe name for scraper files --- services/backend/src/dataflow/scraper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/services/backend/src/dataflow/scraper.py b/services/backend/src/dataflow/scraper.py index f5a99c4..32c696f 100644 --- a/services/backend/src/dataflow/scraper.py +++ b/services/backend/src/dataflow/scraper.py @@ -38,7 +38,8 @@ def safe_filename(url): parsed = urlparse(url) path = parsed.path.strip('/') or 'index' filename = re.sub(r'[^A-Za-z0-9_\-]', '_', path) + ".json" - return os.path.join(DATA_FOLDER, filename) + url_hash = hashlib.md5(url.encode()).hexdigest()[:8] + return os.path.join(DATA_FOLDER, f"{filename}_{url_hash}.json") async def fetch(session, url, semaphore): """Fetch the content of the URL asynchronously.""" From e5766cb3f229a7655ea6f8850ba8e50185bc5343 Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Wed, 16 Apr 2025 12:48:32 -0400 Subject: [PATCH 13/64] add package --- services/backend/src/dataflow/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/backend/src/dataflow/scraper.py b/services/backend/src/dataflow/scraper.py index 32c696f..b73f4ab 100644 --- a/services/backend/src/dataflow/scraper.py +++ b/services/backend/src/dataflow/scraper.py @@ -6,7 +6,7 @@ import re from urllib.parse import urljoin, urlparse from dotenv import load_dotenv - +import hashlib from store_data import upload_many_blobs_with_transfer_manager load_dotenv(override=True) # Configuration From 6b774c5564470105892bf653e4a14b06934127df Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Wed, 16 Apr 2025 18:14:10 -0400 Subject: [PATCH 14/64] try mlflow fix of id tracking --- services/backend/src/dataflow/rag_model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/services/backend/src/dataflow/rag_model.py b/services/backend/src/dataflow/rag_model.py index 7b20b6b..b3d0833 100644 --- a/services/backend/src/dataflow/rag_model.py +++ b/services/backend/src/dataflow/rag_model.py @@ -44,17 +44,18 @@ def get_or_create_experiment(experiment_name): return None # Replace it with: -experiment_id = get_or_create_experiment("rag_experiment") + def ensure_experiment(name): try: mlflow.set_experiment(name) - except RestException: + except Exception as e: mlflow.create_experiment(name) mlflow.set_experiment(name) ensure_experiment("rag_experiment") # mlflow.set_experiment("rag_experiment") mlflow.set_tag("description", "RAG pipeline with Mistral AI model") +# experiment_id = get_or_create_experiment("rag_experiment") if not os.environ.get("MISTRAL_API_KEY"): os.environ["MISTRAL_API_KEY"] = getpass.getpass("Enter API key for Mistral AI: ") From 06734317aff120286f8264f0659057fa250df6a9 Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Wed, 16 Apr 2025 20:59:58 -0400 Subject: [PATCH 15/64] second revision of mlflow --- services/backend/src/dataflow/rag_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/backend/src/dataflow/rag_model.py b/services/backend/src/dataflow/rag_model.py index b3d0833..23327a8 100644 --- a/services/backend/src/dataflow/rag_model.py +++ b/services/backend/src/dataflow/rag_model.py @@ -165,6 +165,7 @@ def generate(state: State): def generateResponse(query): # Compile application and test try: + ensure_experiment("rag_experiment") with mlflow.start_run(run_name="RAG_Pipeline"): mlflow.log_param("query", query) graph_builder = StateGraph(State).add_sequence([retrieve, generate]) @@ -189,6 +190,7 @@ async def checkModel_fairness(): if __name__ == "__main__": query=input("generate query") + ensure_experiment("rag_experiment") response=generateResponse(query) print("MLflow URI:", mlflow.get_tracking_uri()) print("Using experiment ID:", experiment_id) From f0e2fabd078a1506a3e93e3ac9ebfd1cb0531a25 Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Thu, 17 Apr 2025 00:30:57 -0400 Subject: [PATCH 16/64] fix alignment --- services/backend/src/dataflow/rag_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/backend/src/dataflow/rag_model.py b/services/backend/src/dataflow/rag_model.py index 23327a8..49c3581 100644 --- a/services/backend/src/dataflow/rag_model.py +++ b/services/backend/src/dataflow/rag_model.py @@ -166,7 +166,7 @@ def generateResponse(query): # Compile application and test try: ensure_experiment("rag_experiment") - with mlflow.start_run(run_name="RAG_Pipeline"): + with mlflow.start_run(run_name="RAG_Pipeline"): mlflow.log_param("query", query) graph_builder = StateGraph(State).add_sequence([retrieve, generate]) graph_builder.add_edge(START, "retrieve") From 2c9b53479f514378b5e74fba8f709d4a1c90cefc Mon Sep 17 00:00:00 2001 From: Nikhil Date: Fri, 18 Apr 2025 18:15:36 -0400 Subject: [PATCH 17/64] add multiple urls scraping code --- services/backend/.env | 1 + services/backend/runtime-requirements.txt | 1 + services/backend/src/dataflow/rag_model.py | 5 +++-- services/backend/src/dataflow/scraper.py | 25 ++++++++++++++------- services/backend/src/dataflow/store_data.py | 2 +- 5 files changed, 23 insertions(+), 11 deletions(-) diff --git a/services/backend/.env b/services/backend/.env index 7cc284f..60515d6 100644 --- a/services/backend/.env +++ b/services/backend/.env @@ -5,6 +5,7 @@ CONCURRENT_REQUESTS = 10 DATA_FOLDER = "scraped_data" MISTRAL_API_KEY="1UTJndGP95gDBZopQkDojiZ5bCzSJG9p" MLFLOW_TRACKING_URI="http://localhost:5000" +URLS_LIST= "https://www.khoury.northeastern.edu/" PORT=8080 HOST=127.0.0.1 BUCKET_NAME=scraped_raw_data_nubot diff --git a/services/backend/runtime-requirements.txt b/services/backend/runtime-requirements.txt index 4ff0497..953f6d0 100644 --- a/services/backend/runtime-requirements.txt +++ b/services/backend/runtime-requirements.txt @@ -13,6 +13,7 @@ langchain-community langgraph google-cloud-storage datasets +gcsfs beautifulsoup4 aiohttp flask-cors diff --git a/services/backend/src/dataflow/rag_model.py b/services/backend/src/dataflow/rag_model.py index 49c3581..e6a2fb9 100644 --- a/services/backend/src/dataflow/rag_model.py +++ b/services/backend/src/dataflow/rag_model.py @@ -71,8 +71,9 @@ def get_prompt(): template = """You are an expert assistant helping to answer questions based only on the given context. Instructions: -- Use ONLY the context below to answer. -- If the context does not contain the answer, say: "I don't know based on the available information." +- Use the context and search in below to answer. +- you can search https://www.khoury.northeastern.edu/ for answering better if not found any in context +- If context and the website provided does not contain the answer, say: "I don't know based on the available information." - Answer in 2-3 sentences, clearly and factually. - End your response with: "Thanks for asking!" diff --git a/services/backend/src/dataflow/scraper.py b/services/backend/src/dataflow/scraper.py index b73f4ab..0003587 100644 --- a/services/backend/src/dataflow/scraper.py +++ b/services/backend/src/dataflow/scraper.py @@ -10,7 +10,9 @@ from store_data import upload_many_blobs_with_transfer_manager load_dotenv(override=True) # Configuration -BASE_URL = os.getenv('BASE_URL') +URLS_LIST=list(os.getenv('URLS_LIST','').split(",")) + +# BASE_URL ="" #URLS_LIST[0]#os.getenv('BASE_URL') MAX_DEPTH = int(os.getenv('MAX_DEPTH')) # Maximum recursion depth (base URL is depth 0) CONCURRENT_REQUESTS = int(os.getenv('CONCURRENT_REQUESTS')) # Maximum number of concurrent requests @@ -54,7 +56,7 @@ async def fetch(session, url, semaphore): print(f"Error fetching {url}: {e}") return None -async def async_scrape(url, depth=0, session=None, semaphore=None): +async def async_scrape(url,BASE_URL, depth=0, session=None, semaphore=None): """Recursively scrape pages asynchronously and store in JSON format.""" if depth > MAX_DEPTH: return @@ -96,26 +98,33 @@ async def async_scrape(url, depth=0, session=None, semaphore=None): next_url = urljoin(url, link['href']) if urlparse(next_url).netloc == urlparse(BASE_URL).netloc: next_url = next_url.split('#')[0] # Remove fragments - tasks.append(async_scrape(next_url, depth + 1, session, semaphore)) + tasks.append(async_scrape(next_url,BASE_URL, depth + 1, session, semaphore)) if tasks: await asyncio.gather(*tasks) -async def scrape_and_load(): +async def scrape_and_load(CURRENT_URl): """Main function to initiate scraping.""" semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS) async with aiohttp.ClientSession() as session: - await async_scrape(BASE_URL, depth=0, session=session, semaphore=semaphore) + await async_scrape(CURRENT_URl,BASE_URL=CURRENT_URl, depth=0, session=session, semaphore=semaphore) def scrape_and_load_task(): - asyncio.run(scrape_and_load()) + for url in URLS_LIST: + BASE_URL=url + asyncio.run(scrape_and_load(BASE_URL)) + print("*"*15) + print(f"scraping {url} done") + print("*"*15) + upload_many_blobs_with_transfer_manager() return if __name__ == '__main__': - asyncio.run(scrape_and_load()) - upload_many_blobs_with_transfer_manager() \ No newline at end of file + scrape_and_load_task() + # asyncio.run(scrape_and_load()) + # upload_many_blobs_with_transfer_manager() \ No newline at end of file diff --git a/services/backend/src/dataflow/store_data.py b/services/backend/src/dataflow/store_data.py index 837e630..e89ad85 100644 --- a/services/backend/src/dataflow/store_data.py +++ b/services/backend/src/dataflow/store_data.py @@ -68,7 +68,7 @@ def upload_many_blobs_with_transfer_manager( storage_client = Client() bucket = storage_client.bucket(BUCKET_NAME) - source_directory=os.path.join("..","..","scraped_data") + source_directory=os.path.join("scraped_data") filenames = [f for f in os.listdir(source_directory) if f.endswith(".json")] for filename in filenames: file_path = os.path.join(source_directory, filename) From 0e95868b93d0ee74cf0647e1708d6f9305499ff4 Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Fri, 18 Apr 2025 18:16:36 -0400 Subject: [PATCH 18/64] Update backend-docker-image-build.yml --- .github/workflows/backend-docker-image-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/backend-docker-image-build.yml b/.github/workflows/backend-docker-image-build.yml index 4cb163c..54e8976 100644 --- a/.github/workflows/backend-docker-image-build.yml +++ b/.github/workflows/backend-docker-image-build.yml @@ -42,4 +42,4 @@ jobs: --allow-unauthenticated \ --memory 4Gi \ --timeout 3600s \ - --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index" + --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST= "https://www.khoury.northeastern.edu/"" From d4640831c2ae7e68216838461588c04d9db9d9ec Mon Sep 17 00:00:00 2001 From: Nikhil Date: Fri, 18 Apr 2025 23:40:33 -0400 Subject: [PATCH 19/64] update prefect deployment flow --- .github/workflows/prefect_orchestraiton.yml | 56 +++++++ prefectWorkflows/.dockerignore | 3 + prefectWorkflows/.env | 4 +- prefectWorkflows/Dockerfile | 21 +++ prefectWorkflows/dataflow/chunk_data.py | 18 ++- prefectWorkflows/dataflow/rag_model.py | 149 ------------------ prefectWorkflows/dataflow/scraper.py | 47 ++++-- prefectWorkflows/dataflow/store_data.py | 15 +- prefectWorkflows/{scraper_flow.py => flow.py} | 4 + prefectWorkflows/requirements.txt | 17 ++ 10 files changed, 167 insertions(+), 167 deletions(-) create mode 100644 .github/workflows/prefect_orchestraiton.yml create mode 100644 prefectWorkflows/.dockerignore create mode 100644 prefectWorkflows/Dockerfile delete mode 100644 prefectWorkflows/dataflow/rag_model.py rename prefectWorkflows/{scraper_flow.py => flow.py} (85%) create mode 100644 prefectWorkflows/requirements.txt diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml new file mode 100644 index 0000000..6db5d9e --- /dev/null +++ b/.github/workflows/prefect_orchestraiton.yml @@ -0,0 +1,56 @@ +name: Deploy Prefect Flow to Cloud Run + +on: + push: + branches: + - "**" # run on commits to main (adjust branch name as needed) + workflow_dispatch: # allow manual trigger from the Actions tab if needed + +jobs: + build-deploy: + runs-on: ubuntu-latest + + steps: + # Step 1: Check out repository code + - name: Checkout code + uses: actions/checkout@v3 + + # Step 2: Set up gcloud CLI + - name: Set up Google Cloud SDK + uses: google-github-actions/setup-gcloud@v1 + with: + credentials_json: "${{ secrets.GCP_KEY }}" + + # Step 3: Configure Docker auth for Artifact Registry + - name: Docker login for Artifact Registry + run: | + gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev + + # Step 4: Build the Docker image + - name: Build Docker image + run: | + docker build -t ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest . + + # Step 5: Push the image to Artifact Registry + - name: Push Docker image + run: | + docker push ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest + + # Step 6: Install Prefect in the runner (for CLI use) + - name: Install Prefect + run: pip install --no-cache prefect==2.* + + # Step 7: Register/Update Prefect deployment + - name: Register Prefect deployment + # Use Prefect CLI to build and apply the deployment + run: | + prefect deployment build flow.py:scraperflow -n "ScraperFlow Weekly" \ + --pool my-cloud-run-pool -q default \ + --cron "0 9 * * 6" --skip-upload --apply + env: + PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }} + PREFECT_API_URL: ${{ secrets.PREFECT_API_URL }} + + # Step 8: (Optional) Confirm success + - name: Deployment successful + run: echo "Prefect flow deployment updated and scheduled successfully." diff --git a/prefectWorkflows/.dockerignore b/prefectWorkflows/.dockerignore new file mode 100644 index 0000000..3168a00 --- /dev/null +++ b/prefectWorkflows/.dockerignore @@ -0,0 +1,3 @@ +*.env +scraped_data/ +faiss_index/ \ No newline at end of file diff --git a/prefectWorkflows/.env b/prefectWorkflows/.env index 29ab14c..0b2b74e 100644 --- a/prefectWorkflows/.env +++ b/prefectWorkflows/.env @@ -6,4 +6,6 @@ DATA_FOLDER = "scraped_data" BUCKET_NAME=scraped_raw_data_nubot RAW_DATA_FOLDER=raw_data FAISS_INDEX_FOLDER=faiss_index -GOOGLE_APPLICATION_CREDENTIALS="E:/gcpkeys/nubot/nubot-nikhil-6adeee091d55.json" \ No newline at end of file +GOOGLE_APPLICATION_CREDENTIALS="E:/gcpkeys/nubot/nubot-nikhil-6adeee091d55.json" +PREFECT_API_KEY=pnu_VuQWNSlXmc2Hqknf +PREFECT_API_URL="https://api.prefect.cloud/api/accounts-8a55-446a-ac46-80a3f843d8b6" \ No newline at end of file diff --git a/prefectWorkflows/Dockerfile b/prefectWorkflows/Dockerfile new file mode 100644 index 0000000..5d3a8c7 --- /dev/null +++ b/prefectWorkflows/Dockerfile @@ -0,0 +1,21 @@ +# Use a lightweight Python base image +FROM python:3.10-slim + +# Set working directory in container +WORKDIR /opt/prefect/project + +# Copy requirements and install them (if you have a requirements.txt or pyproject.toml) +COPY requirements.txt ./ +RUN pip install -U pip && pip install -r requirements.txt + +# Install Prefect (if not already included in requirements) +RUN pip install prefect==2.* + +# Copy the Prefect flow code and related modules into the image +COPY . . + +# (Optional) If your flow code is a package with setup.py, you could RUN pip install . instead. + +# Set the default command (entrypoint) for the container to do nothing by default. +# Prefect will override this when running the flow, so we keep it simple. +CMD ["python", "-c", "print('Prefect flow container ready')"] diff --git a/prefectWorkflows/dataflow/chunk_data.py b/prefectWorkflows/dataflow/chunk_data.py index aa83b8a..265ce1a 100644 --- a/prefectWorkflows/dataflow/chunk_data.py +++ b/prefectWorkflows/dataflow/chunk_data.py @@ -8,11 +8,23 @@ from dotenv import load_dotenv from google.cloud.storage import Client -from dataflow.store_data import upload_faiss_index_to_bucket +from store_data import upload_faiss_index_to_bucket load_dotenv(override=True) BUCKET_NAME= os.getenv('BUCKET_NAME') from google.auth import default -credentials, project = default() +from google.oauth2 import service_account + +# Try to get credentials - works in both Docker and Cloud Run +try: + # First try Application Default Credentials (works in Cloud Run) + credentials, project = default() +except Exception: + # Fall back to explicit credentials file (for Docker) + credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") + if credentials_path: + credentials = service_account.Credentials.from_service_account_file(credentials_path) + else: + raise Exception("No credentials available") RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER') def chunk_data(): # Load all JSON files from a directory @@ -53,4 +65,4 @@ def chunk_data(): if __name__=="__main__": chunk_data() - + upload_faiss_index_to_bucket() \ No newline at end of file diff --git a/prefectWorkflows/dataflow/rag_model.py b/prefectWorkflows/dataflow/rag_model.py deleted file mode 100644 index ffcfa6c..0000000 --- a/prefectWorkflows/dataflow/rag_model.py +++ /dev/null @@ -1,149 +0,0 @@ -from functools import lru_cache -from langchain import hub -from langchain_core.documents import Document -from langgraph.graph import START, StateGraph -from typing_extensions import List, TypedDict -from langchain_community.embeddings import HuggingFaceEmbeddings -from langchain.chat_models import init_chat_model -from langchain_community.vectorstores import FAISS -import getpass -import os -from dotenv import load_dotenv -import mlflow -import time -from langfair.auto import AutoEval -import asyncio -# Load the FAISS index -from google.cloud.storage import Client -import tempfile -import os -load_dotenv(override=True) -mlflow.langchain.autolog() -MLFLOW_TRACKING_URI =os.environ.get("MLFLOW_TRACKING_URI") -MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY") -FAISS_INDEX_FOLDER= os.getenv('FAISS_INDEX_FOLDER') -mlflow.set_tracking_uri(MLFLOW_TRACKING_URI) # Remote MLflow Server -mlflow.set_experiment("rag_experiment") -if not os.environ.get("MISTRAL_API_KEY"): - os.environ["MISTRAL_API_KEY"] = getpass.getpass("Enter API key for Mistral AI: ") - -@lru_cache(maxsize=None) -def get_llm(): - llm = init_chat_model("mistral-large-latest", model_provider="mistralai") - return llm - -@lru_cache(maxsize=None) -def get_prompt(): -# Define prompt for question-answering - prompt = hub.pull("rlm/rag-prompt") - return prompt - - -# Define state for application -class State(TypedDict): - question: str - context: List[Document] - answer: str - - -@lru_cache(maxsize=None) -def load_embeddings(): - embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") - return embeddings - - - -# Initialize GCS client -storage_client = Client() -bucket=storage_client.bucket(os.getenv('BUCKET_NAME')) -embeddings=load_embeddings() -if not os.path.exists(FAISS_INDEX_FOLDER): - os.makedirs(FAISS_INDEX_FOLDER, exist_ok=True) -# Create a temporary directory -# Download FAISS index files from bucket to FAISS_INDEX_FOLDER directory -for blob in bucket.list_blobs(prefix=FAISS_INDEX_FOLDER): - # Extract just the filename from the full path - filename = os.path.basename(blob.name) - local_path = os.path.join(FAISS_INDEX_FOLDER, filename) - blob.download_to_filename(local_path) - -# Load FAISS index from directory -vector_store = FAISS.load_local(FAISS_INDEX_FOLDER, embeddings, allow_dangerous_deserialization=True) -# Define application steps -def retrieve(state: State): - with mlflow.start_run(nested=True, run_name="retrieval"): - start_time = time.time() - retrieved_docs = vector_store.similarity_search(state["question"]) - retrieval_time = time.time() - start_time - - # Extract only metadata - doc_metadata = [{"doc_id": doc.metadata.get("id", i), "source": doc.metadata.get("source", "unknown")} - for i, doc in enumerate(retrieved_docs)] - - # Log metadata instead of full documents - mlflow.log_metric("retrieval_time", retrieval_time) - mlflow.log_param("retrieved_docs_count", len(retrieved_docs)) - mlflow.log_dict(doc_metadata, "retrieved_docs.json") - - return {"context": retrieved_docs} - -# Initialize LLM once and store in a global variable -llm = get_llm() -# Initialize prompt once and store in a global variable -prompt = get_prompt() -def generate(state: State): - with mlflow.start_run(nested=True, run_name="generation"): - start_time = time.time() - docs_content = "\n\n".join(doc.page_content for doc in state["context"]) - token_count = len(docs_content.split()) - # Use the global prompt instance - mlflow.log_param("retrieved_tokens", token_count) - mlflow.log_param("context_length", len(docs_content)) - messages = prompt.invoke({"question": state["question"], "context": docs_content}) - response = llm.invoke(messages) - generation_time = time.time() - start_time - - # Log LLM generation performance - mlflow.log_metric("generation_time", generation_time) - mlflow.log_param("response_length", len(response.content.split())) - mlflow.log_param("model_name", "mistral-large-latest") - - # Save response - # with open("response.txt", "w") as f: - # f.write(response.content) - # mlflow.log_artifact("response.txt") - - return {"answer": response.content} - - -def generateResponse(query): -# Compile application and test - try: - with mlflow.start_run(run_name="RAG_Pipeline"): - mlflow.log_param("query", query) - graph_builder = StateGraph(State).add_sequence([retrieve, generate]) - graph_builder.add_edge(START, "retrieve") - graph = graph_builder.compile() - response = graph.invoke({"question": f"{query}"}) - mlflow.log_param("final_answer", response["answer"]) - return response["answer"] - except Exception as e: - mlflow.log_param("error", str(e)) - raise Exception(e) - -async def checkModel_fairness(): - auto_object = AutoEval( - prompts=["tell me about khoury"], - langchain_llm=llm, - # toxicity_device=device # uncomment if GPU is available - ) - results = await auto_object.evaluate() - print(results['metrics']) - -if __name__ == "__main__": - - query=input("generate query") - response=generateResponse(query) - print(response) - #uncomment and enter prompts for model fairness and there is a limitation on api key - # asyncio.run(checkModel_fairness()) \ No newline at end of file diff --git a/prefectWorkflows/dataflow/scraper.py b/prefectWorkflows/dataflow/scraper.py index d426ece..0003587 100644 --- a/prefectWorkflows/dataflow/scraper.py +++ b/prefectWorkflows/dataflow/scraper.py @@ -6,15 +6,30 @@ import re from urllib.parse import urljoin, urlparse from dotenv import load_dotenv - -from dataflow.store_data import upload_many_blobs_with_transfer_manager +import hashlib +from store_data import upload_many_blobs_with_transfer_manager load_dotenv(override=True) # Configuration -BASE_URL = os.getenv('BASE_URL') +URLS_LIST=list(os.getenv('URLS_LIST','').split(",")) + +# BASE_URL ="" #URLS_LIST[0]#os.getenv('BASE_URL') MAX_DEPTH = int(os.getenv('MAX_DEPTH')) # Maximum recursion depth (base URL is depth 0) CONCURRENT_REQUESTS = int(os.getenv('CONCURRENT_REQUESTS')) # Maximum number of concurrent requests + from google.auth import default -credentials, project = default() +from google.oauth2 import service_account + +# Try to get credentials - works in both Docker and Cloud Run +try: + # First try Application Default Credentials (works in Cloud Run) + credentials, project = default() +except Exception: + # Fall back to explicit credentials file (for Docker) + credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") + if credentials_path: + credentials = service_account.Credentials.from_service_account_file(credentials_path) + else: + raise Exception("No credentials available") # Create folder for JSON data DATA_FOLDER = "scraped_data" if not os.path.exists(DATA_FOLDER): @@ -25,7 +40,8 @@ def safe_filename(url): parsed = urlparse(url) path = parsed.path.strip('/') or 'index' filename = re.sub(r'[^A-Za-z0-9_\-]', '_', path) + ".json" - return os.path.join(DATA_FOLDER, filename) + url_hash = hashlib.md5(url.encode()).hexdigest()[:8] + return os.path.join(DATA_FOLDER, f"{filename}_{url_hash}.json") async def fetch(session, url, semaphore): """Fetch the content of the URL asynchronously.""" @@ -40,7 +56,7 @@ async def fetch(session, url, semaphore): print(f"Error fetching {url}: {e}") return None -async def async_scrape(url, depth=0, session=None, semaphore=None): +async def async_scrape(url,BASE_URL, depth=0, session=None, semaphore=None): """Recursively scrape pages asynchronously and store in JSON format.""" if depth > MAX_DEPTH: return @@ -82,26 +98,33 @@ async def async_scrape(url, depth=0, session=None, semaphore=None): next_url = urljoin(url, link['href']) if urlparse(next_url).netloc == urlparse(BASE_URL).netloc: next_url = next_url.split('#')[0] # Remove fragments - tasks.append(async_scrape(next_url, depth + 1, session, semaphore)) + tasks.append(async_scrape(next_url,BASE_URL, depth + 1, session, semaphore)) if tasks: await asyncio.gather(*tasks) -async def scrape_and_load(): +async def scrape_and_load(CURRENT_URl): """Main function to initiate scraping.""" semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS) async with aiohttp.ClientSession() as session: - await async_scrape(BASE_URL, depth=0, session=session, semaphore=semaphore) + await async_scrape(CURRENT_URl,BASE_URL=CURRENT_URl, depth=0, session=session, semaphore=semaphore) def scrape_and_load_task(): - asyncio.run(scrape_and_load()) + for url in URLS_LIST: + BASE_URL=url + asyncio.run(scrape_and_load(BASE_URL)) + print("*"*15) + print(f"scraping {url} done") + print("*"*15) + upload_many_blobs_with_transfer_manager() return if __name__ == '__main__': - asyncio.run(scrape_and_load()) - upload_many_blobs_with_transfer_manager() \ No newline at end of file + scrape_and_load_task() + # asyncio.run(scrape_and_load()) + # upload_many_blobs_with_transfer_manager() \ No newline at end of file diff --git a/prefectWorkflows/dataflow/store_data.py b/prefectWorkflows/dataflow/store_data.py index de6cc16..e89ad85 100644 --- a/prefectWorkflows/dataflow/store_data.py +++ b/prefectWorkflows/dataflow/store_data.py @@ -6,8 +6,19 @@ RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER') FAISS_INDEX_FOLDER= os.getenv('FAISS_INDEX_FOLDER') from google.auth import default -credentials, project = default() +from google.oauth2 import service_account +# Try to get credentials - works in both Docker and Cloud Run +try: + # First try Application Default Credentials (works in Cloud Run) + credentials, project = default() +except Exception: + # Fall back to explicit credentials file (for Docker) + credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") + if credentials_path: + credentials = service_account.Credentials.from_service_account_file(credentials_path) + else: + raise Exception("No credentials available") def get_blob_from_bucket(): storage_client = Client() bucket = storage_client.bucket(BUCKET_NAME) @@ -20,7 +31,7 @@ def get_blob_from_bucket(): def upload_many_blobs_with_transfer_manager( - + workers=8 ): """Upload every file in a list to a bucket, concurrently in a process pool. diff --git a/prefectWorkflows/scraper_flow.py b/prefectWorkflows/flow.py similarity index 85% rename from prefectWorkflows/scraper_flow.py rename to prefectWorkflows/flow.py index b82f9a8..2a7ec09 100644 --- a/prefectWorkflows/scraper_flow.py +++ b/prefectWorkflows/flow.py @@ -1,7 +1,11 @@ from prefect import flow, task from dataflow.scraper import scrape_and_load_task from dataflow.chunk_data import chunk_data +from dotenv import load_dotenv +import os +load_dotenv(override=True) +PREFECT_API_KEY=os.getenv('PREFECT_API_KEY') @task def scrape_all_urls_task(): diff --git a/prefectWorkflows/requirements.txt b/prefectWorkflows/requirements.txt new file mode 100644 index 0000000..c4a2008 --- /dev/null +++ b/prefectWorkflows/requirements.txt @@ -0,0 +1,17 @@ +python-dotenv +requests +transformers==4.48.0 +sentence-transformers +torch +faiss-cpu +mlflow +langchain[mistralai] +langchain-community +langgraph +google-cloud-storage +datasets +gcsfs +beautifulsoup4 +aiohttp +langchain-huggingface +langfair From ae83ad5565418993c0a4d4a04e243d5f5f68d771 Mon Sep 17 00:00:00 2001 From: Nikhil Date: Fri, 18 Apr 2025 23:43:12 -0400 Subject: [PATCH 20/64] update path --- .github/workflows/prefect_orchestraiton.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 6db5d9e..14ade22 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -4,6 +4,8 @@ on: push: branches: - "**" # run on commits to main (adjust branch name as needed) + paths: + - "prefectWorkflows/**" workflow_dispatch: # allow manual trigger from the Actions tab if needed jobs: From 459e854426a934ca1139fac4bcc5a8622563b8f8 Mon Sep 17 00:00:00 2001 From: Nikhil Date: Fri, 18 Apr 2025 23:44:01 -0400 Subject: [PATCH 21/64] update prefect workflow --- .github/workflows/prefect_orchestraiton.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 14ade22..d6d22ed 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -31,6 +31,7 @@ jobs: # Step 4: Build the Docker image - name: Build Docker image run: | + cd prefectWorkflows docker build -t ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest . # Step 5: Push the image to Artifact Registry From 29e359c88dce3be0f7d0f861a1d707d864886ccf Mon Sep 17 00:00:00 2001 From: Nikhil Date: Fri, 18 Apr 2025 23:55:32 -0400 Subject: [PATCH 22/64] add tgcloud run deploy command --- .github/workflows/prefect_orchestraiton.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index d6d22ed..8a02dcd 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -57,3 +57,24 @@ jobs: # Step 8: (Optional) Confirm success - name: Deployment successful run: echo "Prefect flow deployment updated and scheduled successfully." + deploy-worker: + needs: build-and-deploy-image + runs-on: ubuntu-latest + steps: + - name: Set up Google Cloud SDK + uses: google-github-actions/setup-gcloud@v1 + with: + credentials_json: "${{ secrets.GCP_KEY }}" + + - name: Deploy Prefect worker to Cloud Run + run: | + gcloud run deploy prefect-worker \ + --image=prefecthq/prefect:3-latest \ + --service-account prefect-sa@${{ secrets.GCP_PROJECT_ID }}.iam.gserviceaccount.com \ + --no-cpu-throttling \ + --platform managed \ + --allow-unauthenticated \ + --memory 4Gi \ + --timeout 3600s \ + --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST= "https://www.khoury.northeastern.edu/",PREFECT_API_URL=${{ secrets.PREFECT_API_URL }},PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} " + --args "prefect","worker","start","--pool","my-cloud-run-pool","-t","cloud-run" From 560f113455ca09b6af5030c14365abd3085fd809 Mon Sep 17 00:00:00 2001 From: Nikhil Date: Fri, 18 Apr 2025 23:56:33 -0400 Subject: [PATCH 23/64] temporarily disable path tracking --- .github/workflows/prefect_orchestraiton.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 8a02dcd..f0e0933 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -4,8 +4,8 @@ on: push: branches: - "**" # run on commits to main (adjust branch name as needed) - paths: - - "prefectWorkflows/**" + # paths: + # - "prefectWorkflows/**" workflow_dispatch: # allow manual trigger from the Actions tab if needed jobs: From 3274482574b8bf0fd5980625b8c6a2951426d673 Mon Sep 17 00:00:00 2001 From: Nikhil Date: Fri, 18 Apr 2025 23:57:25 -0400 Subject: [PATCH 24/64] update --- .github/workflows/prefect_orchestraiton.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index f0e0933..c99162f 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -58,7 +58,7 @@ jobs: - name: Deployment successful run: echo "Prefect flow deployment updated and scheduled successfully." deploy-worker: - needs: build-and-deploy-image + needs: build-and-deploy runs-on: ubuntu-latest steps: - name: Set up Google Cloud SDK From cba0fa72b650ff253e46e0233930928f92e10d53 Mon Sep 17 00:00:00 2001 From: Nikhil Date: Fri, 18 Apr 2025 23:59:05 -0400 Subject: [PATCH 25/64] update flow --- .github/workflows/prefect_orchestraiton.yml | 27 ++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index c99162f..2149da8 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -3,10 +3,8 @@ name: Deploy Prefect Flow to Cloud Run on: push: branches: - - "**" # run on commits to main (adjust branch name as needed) - # paths: - # - "prefectWorkflows/**" - workflow_dispatch: # allow manual trigger from the Actions tab if needed + - "**" # run on commits to any branch (adjust as needed) + workflow_dispatch: # allow manual trigger from the Actions tab jobs: build-deploy: @@ -32,7 +30,9 @@ jobs: - name: Build Docker image run: | cd prefectWorkflows - docker build -t ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest . + docker build \ + -t ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \ + . # Step 5: Push the image to Artifact Registry - name: Push Docker image @@ -45,11 +45,14 @@ jobs: # Step 7: Register/Update Prefect deployment - name: Register Prefect deployment - # Use Prefect CLI to build and apply the deployment run: | - prefect deployment build flow.py:scraperflow -n "ScraperFlow Weekly" \ - --pool my-cloud-run-pool -q default \ - --cron "0 9 * * 6" --skip-upload --apply + prefect deployment build flow.py:scraperflow \ + -n "ScraperFlow Weekly" \ + --pool my-cloud-run-pool \ + -q default \ + --cron "0 9 * * 6" \ + --skip-upload \ + --apply env: PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }} PREFECT_API_URL: ${{ secrets.PREFECT_API_URL }} @@ -57,9 +60,11 @@ jobs: # Step 8: (Optional) Confirm success - name: Deployment successful run: echo "Prefect flow deployment updated and scheduled successfully." + deploy-worker: - needs: build-and-deploy + needs: build-deploy runs-on: ubuntu-latest + steps: - name: Set up Google Cloud SDK uses: google-github-actions/setup-gcloud@v1 @@ -76,5 +81,5 @@ jobs: --allow-unauthenticated \ --memory 4Gi \ --timeout 3600s \ - --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST= "https://www.khoury.northeastern.edu/",PREFECT_API_URL=${{ secrets.PREFECT_API_URL }},PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} " + --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL=${{ secrets.PREFECT_API_URL }},PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} \ --args "prefect","worker","start","--pool","my-cloud-run-pool","-t","cloud-run" From 330d92ebe7f4c25a3c762e25f2311fa177c47400 Mon Sep 17 00:00:00 2001 From: Nikhil Date: Sat, 19 Apr 2025 00:04:11 -0400 Subject: [PATCH 26/64] update credentials flow --- .github/workflows/prefect_orchestraiton.yml | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 2149da8..92542e3 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -15,11 +15,13 @@ jobs: - name: Checkout code uses: actions/checkout@v3 - # Step 2: Set up gcloud CLI - - name: Set up Google Cloud SDK - uses: google-github-actions/setup-gcloud@v1 + - name: GCP Authentication + uses: google-github-actions/auth@v2 with: credentials_json: "${{ secrets.GCP_KEY }}" + # Step 2: Set up gcloud CLI + - name: Set up Google Cloud SDK + uses: google-github-actions/setup-gcloud@v2 # Step 3: Configure Docker auth for Artifact Registry - name: Docker login for Artifact Registry @@ -66,10 +68,12 @@ jobs: runs-on: ubuntu-latest steps: - - name: Set up Google Cloud SDK - uses: google-github-actions/setup-gcloud@v1 + - name: GCP Authentication + uses: google-github-actions/auth@v2 with: credentials_json: "${{ secrets.GCP_KEY }}" + - name: Set up Google Cloud SDK + uses: google-github-actions/setup-gcloud@v2 - name: Deploy Prefect worker to Cloud Run run: | From 879eec3d27739133ae892c20160c6f0c1afd584c Mon Sep 17 00:00:00 2001 From: Nikhil Date: Sat, 19 Apr 2025 00:08:35 -0400 Subject: [PATCH 27/64] update image --- .github/workflows/prefect_orchestraiton.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 92542e3..01d62eb 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -33,13 +33,13 @@ jobs: run: | cd prefectWorkflows docker build \ - -t ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \ + -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest \ . # Step 5: Push the image to Artifact Registry - name: Push Docker image run: | - docker push ${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest + docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest # Step 6: Install Prefect in the runner (for CLI use) - name: Install Prefect From 8a8beb8d3232ac4a5fe33fd5ceb35de2e86cfeaf Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Sat, 19 Apr 2025 09:27:23 -0400 Subject: [PATCH 28/64] Update prefect_orchestraiton.yml --- .github/workflows/prefect_orchestraiton.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 01d62eb..2e49349 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -48,6 +48,7 @@ jobs: # Step 7: Register/Update Prefect deployment - name: Register Prefect deployment run: | + cd prefectWorkflows prefect deployment build flow.py:scraperflow \ -n "ScraperFlow Weekly" \ --pool my-cloud-run-pool \ From 0cfaf6c65c18f0016f8146a89b4c271f359cc9b2 Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Sat, 19 Apr 2025 12:06:20 -0400 Subject: [PATCH 29/64] update prefect .yaml --- prefectWorkflows/prefect.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/prefectWorkflows/prefect.yaml b/prefectWorkflows/prefect.yaml index ab669e5..c929c2c 100644 --- a/prefectWorkflows/prefect.yaml +++ b/prefectWorkflows/prefect.yaml @@ -13,11 +13,11 @@ build: push: # # pull section allows you to provide instructions for cloning this project in remote locations -# pull: -# - prefect.deployments.steps.git_clone: -# repository: https://github.com/Nikhil-Kudupudi/NUBot.git -# branch: gcs_bucket -# access_token: +pull: +- prefect.deployments.steps.git_clone: + repository: https://github.com/Nikhil-Kudupudi/NUBot.git + branch: docker-deployment + access_token: # the deployments section allows you to provide configuration for deploying flows deployments: @@ -34,14 +34,14 @@ deployments: work_queue_name: job_variables: {} - name: default - version: + version: 1.0.0 tags: [] concurrency_limit: - description: - entrypoint: scraper_flow.py:scraperflow + description: "cloud run prefect flow" + entrypoint: prefectWorkflows/flow.py:scraperflow parameters: {} work_pool: - name: nubot_dataflow - work_queue_name: + name: my-cloud-run-pool + work_queue_name: default job_variables: {} schedules: [] From 88e18b5f93b86e1ebf070e7d0ee1ad10f3fc26ab Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Sat, 19 Apr 2025 12:07:34 -0400 Subject: [PATCH 30/64] update the latest command --- .github/workflows/prefect_orchestraiton.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 2e49349..f2d36f8 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -49,13 +49,7 @@ jobs: - name: Register Prefect deployment run: | cd prefectWorkflows - prefect deployment build flow.py:scraperflow \ - -n "ScraperFlow Weekly" \ - --pool my-cloud-run-pool \ - -q default \ - --cron "0 9 * * 6" \ - --skip-upload \ - --apply + prefect deployment prefect.yaml --apply env: PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }} PREFECT_API_URL: ${{ secrets.PREFECT_API_URL }} From 340e488544f93a0c58e8d590cc6feec8e4794f8a Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Sat, 19 Apr 2025 12:36:53 -0400 Subject: [PATCH 31/64] update command --- .github/workflows/prefect_orchestraiton.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index f2d36f8..1fc6af0 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -49,7 +49,7 @@ jobs: - name: Register Prefect deployment run: | cd prefectWorkflows - prefect deployment prefect.yaml --apply + prefect deploy --apply prefect.yaml env: PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }} PREFECT_API_URL: ${{ secrets.PREFECT_API_URL }} From f9ceeac67383296261b330e46717afd633719474 Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Sat, 19 Apr 2025 12:56:10 -0400 Subject: [PATCH 32/64] update command for prefect deploy --- .github/workflows/prefect_orchestraiton.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 1fc6af0..f52bfbc 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -49,7 +49,14 @@ jobs: - name: Register Prefect deployment run: | cd prefectWorkflows - prefect deploy --apply prefect.yaml + prefect deploy flow.py:scraperflow \ + --name "scraperflow-deployment" \ + --description "ScraperFlow Deployment for scraping tasks" \ + --tag scraper --tag production \ + --cron "0 9 * * 6" \ + --pool my-cloud-run-pool \ + --work-queue default + env: PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }} PREFECT_API_URL: ${{ secrets.PREFECT_API_URL }} From 9c048d55dfc4873bc85ece4fbc1831e14a91a90b Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Sat, 19 Apr 2025 13:06:49 -0400 Subject: [PATCH 33/64] create a new flow --- .github/workflows/prefect_orchestraiton.yml | 20 ++++----- prefectWorkflows/prefect.yaml | 47 --------------------- 2 files changed, 10 insertions(+), 57 deletions(-) delete mode 100644 prefectWorkflows/prefect.yaml diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index f52bfbc..53ee8af 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -29,17 +29,17 @@ jobs: gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev # Step 4: Build the Docker image - - name: Build Docker image - run: | - cd prefectWorkflows - docker build \ - -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest \ - . + # - name: Build Docker image + # run: | + # cd prefectWorkflows + # docker build \ + # -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest \ + # . - # Step 5: Push the image to Artifact Registry - - name: Push Docker image - run: | - docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest + # # Step 5: Push the image to Artifact Registry + # - name: Push Docker image + # run: | + # docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest # Step 6: Install Prefect in the runner (for CLI use) - name: Install Prefect diff --git a/prefectWorkflows/prefect.yaml b/prefectWorkflows/prefect.yaml deleted file mode 100644 index c929c2c..0000000 --- a/prefectWorkflows/prefect.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Welcome to your prefect.yaml file! You can use this file for storing and managing -# configuration for deploying your flows. We recommend committing this file to source -# control along with your flow code. - -# Generic metadata about this project -name: prefectWorkflows -prefect-version: 3.2.15 - -# build section allows you to manage and build docker images -build: - -# push section allows you to manage if and how this project is uploaded to remote locations -push: - -# # pull section allows you to provide instructions for cloning this project in remote locations -pull: -- prefect.deployments.steps.git_clone: - repository: https://github.com/Nikhil-Kudupudi/NUBot.git - branch: docker-deployment - access_token: - -# the deployments section allows you to provide configuration for deploying flows -deployments: -- name: - version: - tags: [] - description: - schedule: {} - flow_name: - entrypoint: - parameters: {} - work_pool: - name: - work_queue_name: - job_variables: {} -- name: default - version: 1.0.0 - tags: [] - concurrency_limit: - description: "cloud run prefect flow" - entrypoint: prefectWorkflows/flow.py:scraperflow - parameters: {} - work_pool: - name: my-cloud-run-pool - work_queue_name: default - job_variables: {} - schedules: [] From 203f0f5dba53fe576b88fe0aefff5dea4f99015a Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Sat, 19 Apr 2025 13:21:20 -0400 Subject: [PATCH 34/64] update test changes --- .github/workflows/prefect_orchestraiton.yml | 2 +- prefectWorkflows/prefect.yaml | 44 +++++++++++++++++++ prefectWorkflows/{flow.py => scraper_flow.py} | 0 3 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 prefectWorkflows/prefect.yaml rename prefectWorkflows/{flow.py => scraper_flow.py} (100%) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 53ee8af..7dfff7b 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -49,7 +49,7 @@ jobs: - name: Register Prefect deployment run: | cd prefectWorkflows - prefect deploy flow.py:scraperflow \ + prefect deploy scraper_flow.py:scraperflow \ --name "scraperflow-deployment" \ --description "ScraperFlow Deployment for scraping tasks" \ --tag scraper --tag production \ diff --git a/prefectWorkflows/prefect.yaml b/prefectWorkflows/prefect.yaml new file mode 100644 index 0000000..a6117a8 --- /dev/null +++ b/prefectWorkflows/prefect.yaml @@ -0,0 +1,44 @@ +# Welcome to your prefect.yaml file! You can use this file for storing and managing +# configuration for deploying your flows. We recommend committing this file to source +# control along with your flow code. + +# Generic metadata about this project +name: prefectWorkflows +prefect-version: 3.2.15 + +# build section allows you to manage and build docker images +# You can leave this empty if not using Docker for deployment +build: {} + +# push section allows you to manage if and how this project is uploaded to remote locations +# Leave this empty if you don't need to push artifacts +push: {} + +# pull section allows you to provide instructions for cloning this project in remote locations +# Remove or uncomment and modify if you need this step for pulling code from GitHub or other places +pull: + - prefect.deployments.steps.git_clone: + repository: https://github.com/Nikhil-Kudupudi/NUBot.git + branch: docker-deployment + access_token: + +# Deployments section allows you to provide configuration for deploying flows +deployments: + - name: "scraperflow-deployment" + version: "1.0.0" # Define a version for your deployment + tags: + - "scraper" + - "production" # Add relevant tags + description: "Deployment for ScraperFlow, scheduled weekly" + schedule: + cron: "0 9 * * 6" # This sets the flow to run every Saturday at 9 AM + flow_name: scraperflow # The name of your flow function + entrypoint: prefectWorkflows/scraper_flow.py:scraperflow # Path to your flow function + parameters: {} # If your flow takes parameters, add them here + work_pool: + name: nubot_dataflow # Specify the name of your work pool + work_queue_name: default # If you have a specific work queue + job_variables: {} # Define any job variables (optional) + + # Example for another deployment, you can add more as needed + diff --git a/prefectWorkflows/flow.py b/prefectWorkflows/scraper_flow.py similarity index 100% rename from prefectWorkflows/flow.py rename to prefectWorkflows/scraper_flow.py From 0b2ce8f22633d9527fe455ebad3ef4675445f803 Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Sat, 19 Apr 2025 13:55:26 -0400 Subject: [PATCH 35/64] update url --- .github/workflows/prefect_orchestraiton.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 7dfff7b..577d2ba 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -58,7 +58,7 @@ jobs: --work-queue default env: - PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }} + PREFECT_API_KEY: https://api.prefect.cloud/api/accounts/${{secrets.PREFECT_ACCOUNT_ID}}/workspaces/${{secrets.PREFECT_WORKSPACE_ID}} PREFECT_API_URL: ${{ secrets.PREFECT_API_URL }} # Step 8: (Optional) Confirm success From 21860c3cc9b50eaaaed907b8f251cb804a4d3f95 Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Sat, 19 Apr 2025 13:57:39 -0400 Subject: [PATCH 36/64] swap keys --- .github/workflows/prefect_orchestraiton.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 577d2ba..a8a87ef 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -58,9 +58,8 @@ jobs: --work-queue default env: - PREFECT_API_KEY: https://api.prefect.cloud/api/accounts/${{secrets.PREFECT_ACCOUNT_ID}}/workspaces/${{secrets.PREFECT_WORKSPACE_ID}} - PREFECT_API_URL: ${{ secrets.PREFECT_API_URL }} - + PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }} + PREFECT_API_URL: https://api.prefect.cloud/api/accounts/${{secrets.PREFECT_ACCOUNT_ID}}/workspaces/${{secrets.PREFECT_WORKSPACE_ID}} # Step 8: (Optional) Confirm success - name: Deployment successful run: echo "Prefect flow deployment updated and scheduled successfully." From 845906f358292b1d275b6992b0084a346ba84632 Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Sat, 19 Apr 2025 18:20:59 -0400 Subject: [PATCH 37/64] Update prefect_orchestraiton.yml --- .github/workflows/prefect_orchestraiton.yml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index a8a87ef..4a5f462 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -29,17 +29,17 @@ jobs: gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev # Step 4: Build the Docker image - # - name: Build Docker image - # run: | - # cd prefectWorkflows - # docker build \ - # -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest \ - # . + - name: Build Docker image + run: | + cd prefectWorkflows + docker build \ + -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest \ + . - # # Step 5: Push the image to Artifact Registry - # - name: Push Docker image - # run: | - # docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest + # Step 5: Push the image to Artifact Registry + - name: Push Docker image + run: | + docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest # Step 6: Install Prefect in the runner (for CLI use) - name: Install Prefect From 4cea0fb0e2bcca5bc9483657f2b7b4c2cece21df Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Sat, 19 Apr 2025 18:54:54 -0400 Subject: [PATCH 38/64] Update prefect_orchestraiton.yml --- .github/workflows/prefect_orchestraiton.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 4a5f462..32bb857 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -82,6 +82,7 @@ jobs: --image=prefecthq/prefect:3-latest \ --service-account prefect-sa@${{ secrets.GCP_PROJECT_ID }}.iam.gserviceaccount.com \ --no-cpu-throttling \ + --region ${{ secrets.GCP_REGION }} \ --platform managed \ --allow-unauthenticated \ --memory 4Gi \ From 1abb4ed670c0a7897a48e04cc1c670d44a75edb5 Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Sat, 19 Apr 2025 19:20:49 -0400 Subject: [PATCH 39/64] Update prefect_orchestraiton.yml --- .github/workflows/prefect_orchestraiton.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 32bb857..12e59b4 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -80,7 +80,6 @@ jobs: run: | gcloud run deploy prefect-worker \ --image=prefecthq/prefect:3-latest \ - --service-account prefect-sa@${{ secrets.GCP_PROJECT_ID }}.iam.gserviceaccount.com \ --no-cpu-throttling \ --region ${{ secrets.GCP_REGION }} \ --platform managed \ From 3f707fae5b33adfd7b4cf0380b7bcd45cad915e2 Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Sat, 19 Apr 2025 19:42:17 -0400 Subject: [PATCH 40/64] Update prefect_orchestraiton.yml --- .github/workflows/prefect_orchestraiton.yml | 25 ++++++--------------- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 12e59b4..3dce490 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -33,13 +33,13 @@ jobs: run: | cd prefectWorkflows docker build \ - -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest \ + -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \ . # Step 5: Push the image to Artifact Registry - name: Push Docker image run: | - docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperfloe:latest + docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest # Step 6: Install Prefect in the runner (for CLI use) - name: Install Prefect @@ -60,31 +60,20 @@ jobs: env: PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }} PREFECT_API_URL: https://api.prefect.cloud/api/accounts/${{secrets.PREFECT_ACCOUNT_ID}}/workspaces/${{secrets.PREFECT_WORKSPACE_ID}} - # Step 8: (Optional) Confirm success - - name: Deployment successful - run: echo "Prefect flow deployment updated and scheduled successfully." - - deploy-worker: - needs: build-deploy - runs-on: ubuntu-latest - - steps: - - name: GCP Authentication - uses: google-github-actions/auth@v2 - with: - credentials_json: "${{ secrets.GCP_KEY }}" - - name: Set up Google Cloud SDK - uses: google-github-actions/setup-gcloud@v2 - name: Deploy Prefect worker to Cloud Run run: | gcloud run deploy prefect-worker \ - --image=prefecthq/prefect:3-latest \ + --image=${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \ --no-cpu-throttling \ --region ${{ secrets.GCP_REGION }} \ --platform managed \ + --port 5002 \ --allow-unauthenticated \ --memory 4Gi \ --timeout 3600s \ --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL=${{ secrets.PREFECT_API_URL }},PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} \ --args "prefect","worker","start","--pool","my-cloud-run-pool","-t","cloud-run" + # Step 8: (Optional) Confirm success + - name: Deployment successful + run: echo "Prefect flow deployment updated and scheduled successfully." From 5da0cc9b7049aede6a704995cad5a22b67e16cff Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Sun, 20 Apr 2025 01:28:49 -0400 Subject: [PATCH 41/64] Update prefect_orchestraiton.yml --- .github/workflows/prefect_orchestraiton.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 3dce490..80a3bc4 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -72,6 +72,8 @@ jobs: --allow-unauthenticated \ --memory 4Gi \ --timeout 3600s \ + --entrypoint=sh \ + --args="-c,prefect worker start --pool my-cloud-run-pool -t cloud-run & python3 -m http.server \$PORT" \ --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL=${{ secrets.PREFECT_API_URL }},PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} \ --args "prefect","worker","start","--pool","my-cloud-run-pool","-t","cloud-run" # Step 8: (Optional) Confirm success From 6d74ddb80fe8934fd3e1f9580a3d9dc8eb774b9d Mon Sep 17 00:00:00 2001 From: Nikhil Date: Sun, 20 Apr 2025 12:09:13 -0400 Subject: [PATCH 42/64] update flow --- .github/workflows/prefect_orchestraiton.yml | 14 ++++++++++++ prefectWorkflows/Dockerfile | 25 ++++++--------------- prefectWorkflows/scraper_flow.py | 11 ++++++++- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 80a3bc4..4ccb59a 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -79,3 +79,17 @@ jobs: # Step 8: (Optional) Confirm success - name: Deployment successful run: echo "Prefect flow deployment updated and scheduled successfully." + deploy-flow: + needs: build-push + runs-on: ubuntu-latest + steps: + - name: Install Prefect + run: pip install prefect==3.* + + - name: Run deploy script + env: + PREFECT_API_URL: https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }} + PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }} + run: | + cd prefectWorkflows + python scraper_flow.py diff --git a/prefectWorkflows/Dockerfile b/prefectWorkflows/Dockerfile index 5d3a8c7..f3c20ef 100644 --- a/prefectWorkflows/Dockerfile +++ b/prefectWorkflows/Dockerfile @@ -1,21 +1,10 @@ -# Use a lightweight Python base image +# Dockerfile FROM python:3.10-slim - -# Set working directory in container +ENV PYTHONUNBUFFERED=1 WORKDIR /opt/prefect/project - -# Copy requirements and install them (if you have a requirements.txt or pyproject.toml) -COPY requirements.txt ./ -RUN pip install -U pip && pip install -r requirements.txt - -# Install Prefect (if not already included in requirements) -RUN pip install prefect==2.* - -# Copy the Prefect flow code and related modules into the image +COPY requirements.txt . +RUN pip install --upgrade pip \ + && pip install --no-cache-dir -r requirements.txt \ + && pip install --no-cache-dir prefect==3.* COPY . . - -# (Optional) If your flow code is a package with setup.py, you could RUN pip install . instead. - -# Set the default command (entrypoint) for the container to do nothing by default. -# Prefect will override this when running the flow, so we keep it simple. -CMD ["python", "-c", "print('Prefect flow container ready')"] +CMD ["python", "-c", "print('ready')"] diff --git a/prefectWorkflows/scraper_flow.py b/prefectWorkflows/scraper_flow.py index 2a7ec09..5430fc8 100644 --- a/prefectWorkflows/scraper_flow.py +++ b/prefectWorkflows/scraper_flow.py @@ -3,6 +3,7 @@ from dataflow.chunk_data import chunk_data from dotenv import load_dotenv import os +from prefect.docker import DockerImage load_dotenv(override=True) PREFECT_API_KEY=os.getenv('PREFECT_API_KEY') @@ -30,7 +31,15 @@ def scraperflow(): # push=True # ) try: - scraperflow() + scraperflow.deploy( + name="scraperflow-deployment", + work_pool_name="my-cloud-run-pool", + image=DockerImage( + name="us-docker.pkg.dev/nubot-nikhil/backend-nubot/scraperflow:latest", + platform="linux/amd64", + ), + schedule="0 9 * * 6", + ) except Exception as e: print(e) From 197187c707d1fdfc1d9fbb6759559aeb60d1ba5a Mon Sep 17 00:00:00 2001 From: Nikhil Kudupudi <46317218+Nikhil-Kudupudi@users.noreply.github.com> Date: Sun, 20 Apr 2025 12:12:59 -0400 Subject: [PATCH 43/64] Update prefect_orchestraiton.yml --- .github/workflows/prefect_orchestraiton.yml | 38 ++++++++------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 4ccb59a..f7233b3 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -3,15 +3,14 @@ name: Deploy Prefect Flow to Cloud Run on: push: branches: - - "**" # run on commits to any branch (adjust as needed) - workflow_dispatch: # allow manual trigger from the Actions tab + - "**" # run on commits to any branch (adjust as needed) + workflow_dispatch: # allow manual trigger from the Actions tab jobs: build-deploy: runs-on: ubuntu-latest steps: - # Step 1: Check out repository code - name: Checkout code uses: actions/checkout@v3 @@ -19,16 +18,14 @@ jobs: uses: google-github-actions/auth@v2 with: credentials_json: "${{ secrets.GCP_KEY }}" - # Step 2: Set up gcloud CLI + - name: Set up Google Cloud SDK uses: google-github-actions/setup-gcloud@v2 - # Step 3: Configure Docker auth for Artifact Registry - name: Docker login for Artifact Registry run: | gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev - # Step 4: Build the Docker image - name: Build Docker image run: | cd prefectWorkflows @@ -36,30 +33,26 @@ jobs: -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \ . - # Step 5: Push the image to Artifact Registry - name: Push Docker image run: | docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest - # Step 6: Install Prefect in the runner (for CLI use) - name: Install Prefect run: pip install --no-cache prefect==2.* - # Step 7: Register/Update Prefect deployment - name: Register Prefect deployment run: | cd prefectWorkflows prefect deploy scraper_flow.py:scraperflow \ - --name "scraperflow-deployment" \ - --description "ScraperFlow Deployment for scraping tasks" \ - --tag scraper --tag production \ - --cron "0 9 * * 6" \ - --pool my-cloud-run-pool \ - --work-queue default - + --name "scraperflow-deployment" \ + --description "ScraperFlow Deployment for scraping tasks" \ + --tag scraper --tag production \ + --cron "0 9 * * 6" \ + --pool my-cloud-run-pool \ + --work-queue default env: PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }} - PREFECT_API_URL: https://api.prefect.cloud/api/accounts/${{secrets.PREFECT_ACCOUNT_ID}}/workspaces/${{secrets.PREFECT_WORKSPACE_ID}} + PREFECT_API_URL: "https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}" - name: Deploy Prefect worker to Cloud Run run: | @@ -68,27 +61,26 @@ jobs: --no-cpu-throttling \ --region ${{ secrets.GCP_REGION }} \ --platform managed \ - --port 5002 \ --allow-unauthenticated \ --memory 4Gi \ --timeout 3600s \ --entrypoint=sh \ --args="-c,prefect worker start --pool my-cloud-run-pool -t cloud-run & python3 -m http.server \$PORT" \ - --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL=${{ secrets.PREFECT_API_URL }},PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} \ - --args "prefect","worker","start","--pool","my-cloud-run-pool","-t","cloud-run" - # Step 8: (Optional) Confirm success + --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL=${{ secrets.PREFECT_API_URL }},PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} - name: Deployment successful run: echo "Prefect flow deployment updated and scheduled successfully." + deploy-flow: - needs: build-push + needs: build-deploy runs-on: ubuntu-latest + steps: - name: Install Prefect run: pip install prefect==3.* - name: Run deploy script env: - PREFECT_API_URL: https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }} + PREFECT_API_URL: "https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}" PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }} run: | cd prefectWorkflows From bd0f1f26e6fb4601d238b0e806e0197af86c4b96 Mon Sep 17 00:00:00 2001 From: Nikhil Date: Sun, 20 Apr 2025 13:02:49 -0400 Subject: [PATCH 44/64] update --- .github/workflows/prefect_orchestraiton.yml | 106 ++++++++++---------- prefectWorkflows/.env | 2 +- prefectWorkflows/dataflow/chunk_data.py | 2 +- prefectWorkflows/dataflow/scraper.py | 2 +- 4 files changed, 56 insertions(+), 56 deletions(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index f7233b3..cc55b59 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -1,87 +1,87 @@ -name: Deploy Prefect Flow to Cloud Run +name: CI/CD Prefect Flow & Worker to Cloud Run on: push: branches: - - "**" # run on commits to any branch (adjust as needed) - workflow_dispatch: # allow manual trigger from the Actions tab + - "**" # run on commits to any branch (adjust as needed) + workflow_dispatch: # allow manual trigger jobs: - build-deploy: + build-and-deploy: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: GCP Authentication uses: google-github-actions/auth@v2 with: - credentials_json: "${{ secrets.GCP_KEY }}" + credentials_json: ${{ secrets.GCP_KEY }} - - name: Set up Google Cloud SDK + - name: Setup gcloud CLI uses: google-github-actions/setup-gcloud@v2 + with: + project_id: ${{ secrets.GCP_PROJECT_ID }} - - name: Docker login for Artifact Registry - run: | - gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev + - name: Docker login to Artifact Registry + run: gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev - - name: Build Docker image + - name: Build & Push Docker image run: | cd prefectWorkflows docker build \ -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \ . - - - name: Push Docker image - run: | docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest - - name: Install Prefect - run: pip install --no-cache prefect==2.* + - name: Setup Python & Prefect CLI + uses: actions/setup-python@v5 + with: + python-version: "3.12" + - run: pip install prefect==3.* - - name: Register Prefect deployment - run: | - cd prefectWorkflows - prefect deploy scraper_flow.py:scraperflow \ - --name "scraperflow-deployment" \ - --description "ScraperFlow Deployment for scraping tasks" \ - --tag scraper --tag production \ - --cron "0 9 * * 6" \ - --pool my-cloud-run-pool \ - --work-queue default - env: - PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }} - PREFECT_API_URL: "https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}" + - name: Prefect Auth & Deploy + uses: PrefectHQ/actions-prefect-auth@v1 + with: + prefect-api-key: ${{ secrets.PREFECT_API_KEY }} + prefect-workspace: ${{ secrets.PREFECT_WORKSPACE_ID }} + - uses: PrefectHQ/actions-prefect-deploy@v4 + with: + deployment-names: scraperflow-deployment + requirements-file-paths: requirements.txt + deployment-file-path: prefect.yaml - - name: Deploy Prefect worker to Cloud Run - run: | - gcloud run deploy prefect-worker \ - --image=${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \ - --no-cpu-throttling \ - --region ${{ secrets.GCP_REGION }} \ - --platform managed \ - --allow-unauthenticated \ - --memory 4Gi \ - --timeout 3600s \ - --entrypoint=sh \ - --args="-c,prefect worker start --pool my-cloud-run-pool -t cloud-run & python3 -m http.server \$PORT" \ - --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL=${{ secrets.PREFECT_API_URL }},PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} - name: Deployment successful - run: echo "Prefect flow deployment updated and scheduled successfully." + run: echo "Prefect flow deployed ✔️" - deploy-flow: - needs: build-deploy + deploy-worker: + needs: build-and-deploy runs-on: ubuntu-latest steps: - - name: Install Prefect - run: pip install prefect==3.* + - name: GCP Authentication + uses: google-github-actions/auth@v2 + with: + credentials_json: ${{ secrets.GCP_KEY }} - - name: Run deploy script - env: - PREFECT_API_URL: "https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}" - PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }} + - name: Setup gcloud CLI + uses: google-github-actions/setup-gcloud@v2 + with: + project_id: ${{ secrets.GCP_PROJECT_ID }} + + - name: Deploy Prefect Worker to Cloud Run run: | - cd prefectWorkflows - python scraper_flow.py + gcloud run deploy prefect-worker \ + --image=${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \ + --platform managed \ + --region ${{ secrets.GCP_REGION }} \ + --allow-unauthenticated \ + --no-cpu-throttling \ + --min-instances=1 \ + --timeout=3600s \ + --memory=4Gi \ + --args="-c,prefect worker start --pool my-cloud-run-pool -t cloud-run & python3 -m http.server \$PORT" \ + --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL= "https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}",PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} + - name: Worker deployed + run: echo "Prefect worker deployed ✔️" diff --git a/prefectWorkflows/.env b/prefectWorkflows/.env index 0b2b74e..2e9444b 100644 --- a/prefectWorkflows/.env +++ b/prefectWorkflows/.env @@ -8,4 +8,4 @@ RAW_DATA_FOLDER=raw_data FAISS_INDEX_FOLDER=faiss_index GOOGLE_APPLICATION_CREDENTIALS="E:/gcpkeys/nubot/nubot-nikhil-6adeee091d55.json" PREFECT_API_KEY=pnu_VuQWNSlXmc2Hqknf -PREFECT_API_URL="https://api.prefect.cloud/api/accounts-8a55-446a-ac46-80a3f843d8b6" \ No newline at end of file +PREFECT_API_URL="https://api.prefect.cloud/api/accounts/806f2e07-5063-4fbe-9b46-0545ad5de2d1/workspaces/acdf9e9e-8a55-446a-ac46-80a3f843d8b6" diff --git a/prefectWorkflows/dataflow/chunk_data.py b/prefectWorkflows/dataflow/chunk_data.py index 265ce1a..fc35e37 100644 --- a/prefectWorkflows/dataflow/chunk_data.py +++ b/prefectWorkflows/dataflow/chunk_data.py @@ -8,7 +8,7 @@ from dotenv import load_dotenv from google.cloud.storage import Client -from store_data import upload_faiss_index_to_bucket +from dataflow.store_data import upload_faiss_index_to_bucket load_dotenv(override=True) BUCKET_NAME= os.getenv('BUCKET_NAME') from google.auth import default diff --git a/prefectWorkflows/dataflow/scraper.py b/prefectWorkflows/dataflow/scraper.py index 0003587..4676a4a 100644 --- a/prefectWorkflows/dataflow/scraper.py +++ b/prefectWorkflows/dataflow/scraper.py @@ -7,7 +7,7 @@ from urllib.parse import urljoin, urlparse from dotenv import load_dotenv import hashlib -from store_data import upload_many_blobs_with_transfer_manager +from dataflow.store_data import upload_many_blobs_with_transfer_manager load_dotenv(override=True) # Configuration URLS_LIST=list(os.getenv('URLS_LIST','').split(",")) From 82d458289113d217d320ec540ef776c2407d03b3 Mon Sep 17 00:00:00 2001 From: Nikhil Date: Sun, 20 Apr 2025 13:11:41 -0400 Subject: [PATCH 45/64] add prefect workspace name --- .github/workflows/prefect_orchestraiton.yml | 28 ++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index cc55b59..c237d0a 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -19,21 +19,21 @@ jobs: with: credentials_json: ${{ secrets.GCP_KEY }} - - name: Setup gcloud CLI - uses: google-github-actions/setup-gcloud@v2 - with: - project_id: ${{ secrets.GCP_PROJECT_ID }} + # - name: Setup gcloud CLI + # uses: google-github-actions/setup-gcloud@v2 + # with: + # project_id: ${{ secrets.GCP_PROJECT_ID }} - - name: Docker login to Artifact Registry - run: gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev + # - name: Docker login to Artifact Registry + # run: gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev - - name: Build & Push Docker image - run: | - cd prefectWorkflows - docker build \ - -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \ - . - docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest + # - name: Build & Push Docker image + # run: | + # cd prefectWorkflows + # docker build \ + # -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \ + # . + # docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest - name: Setup Python & Prefect CLI uses: actions/setup-python@v5 @@ -45,7 +45,7 @@ jobs: uses: PrefectHQ/actions-prefect-auth@v1 with: prefect-api-key: ${{ secrets.PREFECT_API_KEY }} - prefect-workspace: ${{ secrets.PREFECT_WORKSPACE_ID }} + prefect-workspace: ${{ secrets.PREFECT_WORKSPACE_NAME }} - uses: PrefectHQ/actions-prefect-deploy@v4 with: deployment-names: scraperflow-deployment From 1d8ac748225e8e70ccc09a2a3f2c3d809a90f8d2 Mon Sep 17 00:00:00 2001 From: Nikhil Date: Sun, 20 Apr 2025 14:50:24 -0400 Subject: [PATCH 46/64] update test flow --- .github/workflows/github-orchestration.yml | 0 .github/workflows/prefect_orchestraiton.yml | 127 +++++++++++--------- prefectWorkflows/Dockerfile | 36 ++++-- prefectWorkflows/scraper_flow.py | 18 +-- 4 files changed, 101 insertions(+), 80 deletions(-) create mode 100644 .github/workflows/github-orchestration.yml diff --git a/.github/workflows/github-orchestration.yml b/.github/workflows/github-orchestration.yml new file mode 100644 index 0000000..e69de29 diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index c237d0a..4af72d8 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -1,87 +1,98 @@ -name: CI/CD Prefect Flow & Worker to Cloud Run +name: Deploy Prefect Flow to Cloud Run on: push: branches: - - "**" # run on commits to any branch (adjust as needed) - workflow_dispatch: # allow manual trigger + - main # or any branches you want to trigger the workflow + workflow_dispatch: # allows manual triggering + +env: + PREFECT_API_URL: https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }} + PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }} jobs: build-and-deploy: runs-on: ubuntu-latest - steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install prefect==2.* python-dotenv - name: GCP Authentication uses: google-github-actions/auth@v2 with: - credentials_json: ${{ secrets.GCP_KEY }} - - # - name: Setup gcloud CLI - # uses: google-github-actions/setup-gcloud@v2 - # with: - # project_id: ${{ secrets.GCP_PROJECT_ID }} + credentials_json: "${{ secrets.GCP_KEY }}" - # - name: Docker login to Artifact Registry - # run: gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev + - name: Set up Google Cloud SDK + uses: google-github-actions/setup-gcloud@v2 - # - name: Build & Push Docker image - # run: | - # cd prefectWorkflows - # docker build \ - # -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \ - # . - # docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest + - name: Configure Docker for Artifact Registry + run: | + gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev - - name: Setup Python & Prefect CLI - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - run: pip install prefect==3.* + - name: Create Prefect Cloud Run pool if not exists + run: | + # Check if pool exists, create if it doesn't + if ! prefect pool ls | grep -q "my-cloud-run-pool"; then + prefect pool create my-cloud-run-pool --type cloud-run + fi - - name: Prefect Auth & Deploy - uses: PrefectHQ/actions-prefect-auth@v1 - with: - prefect-api-key: ${{ secrets.PREFECT_API_KEY }} - prefect-workspace: ${{ secrets.PREFECT_WORKSPACE_NAME }} - - uses: PrefectHQ/actions-prefect-deploy@v4 - with: - deployment-names: scraperflow-deployment - requirements-file-paths: requirements.txt - deployment-file-path: prefect.yaml + # Check if work queue exists, create if it doesn't + if ! prefect work-queue ls | grep -q "default"; then + prefect work-queue create default --pool my-cloud-run-pool + fi - - name: Deployment successful - run: echo "Prefect flow deployed ✔️" + - name: Build Docker image + run: | + cd prefectWorkflows + docker build \ + -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \ + . - deploy-worker: - needs: build-and-deploy - runs-on: ubuntu-latest + - name: Push Docker image + run: | + docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest - steps: - - name: GCP Authentication - uses: google-github-actions/auth@v2 - with: - credentials_json: ${{ secrets.GCP_KEY }} + - name: Verify Prefect Connectivity + run: | + prefect cloud login -k ${{secrets.PREFECT_API_KEY}} + prefect config set PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} + prefect config set PREFECT_API_URL=${{ secrets.PREFECT_API_URL }} + prefect cloud workspace ls - - name: Setup gcloud CLI - uses: google-github-actions/setup-gcloud@v2 - with: - project_id: ${{ secrets.GCP_PROJECT_ID }} + # If that succeeded, proceed with deployment + - name: Register Prefect deployment + run: | + cd prefectWorkflows + prefect deploy scraper_flow.py:scraperflow \ + --name "scraperflow-deployment" \ + --description "ScraperFlow Deployment for scraping tasks" \ + --tag scraper --tag production \ + --cron "0 9 * * 6" \ + --pool my-cloud-run-pool \ + --work-queue default - - name: Deploy Prefect Worker to Cloud Run + - name: Deploy Prefect worker to Cloud Run run: | gcloud run deploy prefect-worker \ --image=${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \ - --platform managed \ --region ${{ secrets.GCP_REGION }} \ + --platform managed \ + --port 8080 \ --allow-unauthenticated \ - --no-cpu-throttling \ - --min-instances=1 \ - --timeout=3600s \ - --memory=4Gi \ - --args="-c,prefect worker start --pool my-cloud-run-pool -t cloud-run & python3 -m http.server \$PORT" \ + --memory 4Gi \ + --cpu 2 \ + --timeout 3600s \ + --concurrency 80 \ --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL= "https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}",PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} - - name: Worker deployed - run: echo "Prefect worker deployed ✔️" + - name: Deployment successful + run: echo "Prefect flow deployment updated and scheduled successfully." diff --git a/prefectWorkflows/Dockerfile b/prefectWorkflows/Dockerfile index f3c20ef..f33c503 100644 --- a/prefectWorkflows/Dockerfile +++ b/prefectWorkflows/Dockerfile @@ -1,10 +1,32 @@ -# Dockerfile FROM python:3.10-slim -ENV PYTHONUNBUFFERED=1 -WORKDIR /opt/prefect/project + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + git \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements file COPY requirements.txt . -RUN pip install --upgrade pip \ - && pip install --no-cache-dir -r requirements.txt \ - && pip install --no-cache-dir prefect==3.* + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the application code COPY . . -CMD ["python", "-c", "print('ready')"] + +# Set environment variables +ENV PYTHONUNBUFFERED=1 + +# Default command to run the Prefect worker +CMD ["prefect", "worker", "start", "--pool", "my-cloud-run-pool", "--type", "cloud-run"] + +# Expose the port that Cloud Run expects +EXPOSE 8080 + +# Health check endpoint for Cloud Run +HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:8080/ || exit 1 \ No newline at end of file diff --git a/prefectWorkflows/scraper_flow.py b/prefectWorkflows/scraper_flow.py index 5430fc8..c2d7602 100644 --- a/prefectWorkflows/scraper_flow.py +++ b/prefectWorkflows/scraper_flow.py @@ -24,22 +24,10 @@ def scraperflow(): if __name__ == "__main__": # # Run the flow -## for cloud - # scraperflow.deploy(name="my-first-deployment", - # work_pool_name="dataflow", - # image='prefecthq/prefect:2-python3.10', - # push=True - # ) + try: - scraperflow.deploy( - name="scraperflow-deployment", - work_pool_name="my-cloud-run-pool", - image=DockerImage( - name="us-docker.pkg.dev/nubot-nikhil/backend-nubot/scraperflow:latest", - platform="linux/amd64", - ), - schedule="0 9 * * 6", - ) + + scraperflow() except Exception as e: print(e) From 771d9a5a6819b8f8a24a1bcc94627f581767ef87 Mon Sep 17 00:00:00 2001 From: Nikhil Date: Sun, 20 Apr 2025 14:56:19 -0400 Subject: [PATCH 47/64] test --- .github/workflows/prefect_orchestraiton.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index 4af72d8..a90a692 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -3,7 +3,7 @@ name: Deploy Prefect Flow to Cloud Run on: push: branches: - - main # or any branches you want to trigger the workflow + - "**" # or any branches you want to trigger the workflow workflow_dispatch: # allows manual triggering env: From 5be490997ebea41ffeec31710db70cb9eac8c20e Mon Sep 17 00:00:00 2001 From: Nikhil Date: Sun, 20 Apr 2025 15:32:49 -0400 Subject: [PATCH 48/64] update test 2 --- .github/workflows/prefect_orchestraiton.yml | 120 ++++++++------------ prefectWorkflows/Dockerfile | 40 +++---- prefectWorkflows/prefect.yaml | 56 +++------ prefectWorkflows/scraper_flow.py | 55 ++++----- 4 files changed, 108 insertions(+), 163 deletions(-) diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml index a90a692..a322211 100644 --- a/.github/workflows/prefect_orchestraiton.yml +++ b/.github/workflows/prefect_orchestraiton.yml @@ -2,13 +2,7 @@ name: Deploy Prefect Flow to Cloud Run on: push: - branches: - - "**" # or any branches you want to trigger the workflow - workflow_dispatch: # allows manual triggering - -env: - PREFECT_API_URL: https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }} - PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }} + branches: [main] # Trigger on push to main (adjust as needed) jobs: build-and-deploy: @@ -17,82 +11,62 @@ jobs: - name: Checkout code uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 + # Authenticate to Google Cloud using the service account JSON key + - name: Set up gcloud CLI + uses: google-github-actions/setup-gcloud@v1 with: - python-version: "3.10" - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install prefect==2.* python-dotenv - - - name: GCP Authentication - uses: google-github-actions/auth@v2 - with: - credentials_json: "${{ secrets.GCP_KEY }}" - - - name: Set up Google Cloud SDK - uses: google-github-actions/setup-gcloud@v2 + service_account_key: ${{ secrets.GCP_KEY }} + project_id: ${{ secrets.GCP_PROJECT_ID }} + export_default_credentials: true - - name: Configure Docker for Artifact Registry + - name: Configure Docker auth for Artifact Registry run: | - gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev - - - name: Create Prefect Cloud Run pool if not exists - run: | - # Check if pool exists, create if it doesn't - if ! prefect pool ls | grep -q "my-cloud-run-pool"; then - prefect pool create my-cloud-run-pool --type cloud-run - fi - - # Check if work queue exists, create if it doesn't - if ! prefect work-queue ls | grep -q "default"; then - prefect work-queue create default --pool my-cloud-run-pool - fi + gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev -q + # The above command logs Docker into Artifact Registry​:contentReference[oaicite:10]{index=10} + # using the gcloud credentials (no interactive prompt due to -q). - name: Build Docker image run: | - cd prefectWorkflows - docker build \ - -t ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \ - . + IMAGE_URI="${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}//prefect-scraper:latest" + echo "Building image $IMAGE_URI" + docker build -t "$IMAGE_URI" -f prefectWorkflows/Dockerfile . + # Note: The context is the repository root (.), adjust path to Dockerfile if needed. - - name: Push Docker image + - name: Push Docker image to Artifact Registry run: | - docker push ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest + IMAGE_URI="${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}//prefect-scraper:latest" + docker push "$IMAGE_URI" + # After this step, the image is available in Artifact Registry for Cloud Run to use. - - name: Verify Prefect Connectivity - run: | - prefect cloud login -k ${{secrets.PREFECT_API_KEY}} - prefect config set PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} - prefect config set PREFECT_API_URL=${{ secrets.PREFECT_API_URL }} - prefect cloud workspace ls + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Install Prefect + run: pip install prefect==3.1.10 - # If that succeeded, proceed with deployment - - name: Register Prefect deployment + - name: Authenticate with Prefect Cloud + env: + PREFECT_API_KEY: ${{ secrets.PREFECT_API_KEY }} run: | - cd prefectWorkflows - prefect deploy scraper_flow.py:scraperflow \ - --name "scraperflow-deployment" \ - --description "ScraperFlow Deployment for scraping tasks" \ - --tag scraper --tag production \ - --cron "0 9 * * 6" \ - --pool my-cloud-run-pool \ - --work-queue default + # Use Prefect CLI to log in to Prefect Cloud non-interactively + prefect cloud login -k $PREFECT_API_KEY -w "${{ secrets.PREFECT_WORKSPACE }}" || { + # Fallback: manually set API URL and API Key if the above doesn't work + echo "Using manual Prefect Cloud authentication..." + prefect config set PREFECT_API_URL="https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}" + prefect config set PREFECT_API_KEY="${{ secrets.PREFECT_API_KEY }}" + } + # This step authenticates the CLI with Prefect Cloud. + # We first attempt `prefect cloud login` (with API key and workspace) for convenience. + # If that fails (for example, if workspace name flag is not supported non-interactively), + # we fall back to setting PREFECT_API_URL and PREFECT_API_KEY directly​:contentReference[oaicite:11]{index=11}. + # The PREFECT_API_URL uses your Account ID and Workspace ID (from secrets) + # to target the correct workspace. This avoids "401 Unauthorized" due to wrong API URL. - - name: Deploy Prefect worker to Cloud Run + - name: Deploy Prefect flow run: | - gcloud run deploy prefect-worker \ - --image=${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/scraperflow:latest \ - --region ${{ secrets.GCP_REGION }} \ - --platform managed \ - --port 8080 \ - --allow-unauthenticated \ - --memory 4Gi \ - --cpu 2 \ - --timeout 3600s \ - --concurrency 80 \ - --set-env-vars AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/,PREFECT_API_URL= "https://api.prefect.cloud/api/accounts/${{ secrets.PREFECT_ACCOUNT_ID }}/workspaces/${{ secrets.PREFECT_WORKSPACE_ID }}",PREFECT_API_KEY=${{ secrets.PREFECT_API_KEY }} - - name: Deployment successful - run: echo "Prefect flow deployment updated and scheduled successfully." + cd prefectWorkflows # navigate to the folder containing prefect.yaml + prefect deploy -n scraperflow-deployment + # The -n flag ensures we deploy the specific deployment by name (optional if only one deployment in YAML). + # This command reads prefect.yaml and registers/updates the deployment in Prefect Cloud. diff --git a/prefectWorkflows/Dockerfile b/prefectWorkflows/Dockerfile index f33c503..ea2d944 100644 --- a/prefectWorkflows/Dockerfile +++ b/prefectWorkflows/Dockerfile @@ -1,32 +1,24 @@ +# Start from a lightweight Python image (use the appropriate Python version) FROM python:3.10-slim +# Set working directory in container WORKDIR /app -# Install system dependencies -RUN apt-get update && apt-get install -y \ - build-essential \ - curl \ - git \ - && rm -rf /var/lib/apt/lists/* +# Install Python dependencies. +# If you have a requirements.txt, copy and install it: +COPY requirements.txt . +RUN pip install -r requirements.txt -# Copy requirements file -COPY requirements.txt . +# (Alternatively, directly install Prefect and any needed libraries) +# RUN pip install prefect==3.1.10 -# Install Python dependencies -RUN pip install --no-cache-dir -r requirements.txt +# Copy the Prefect flow code and the dataflow module into the image +COPY prefectWorkflows/ /app/prefectWorkflows/ +COPY dataflow/ /app/dataflow/ -# Copy the application code -COPY . . +# Ensure Python can find the 'dataflow' module (add /app to PYTHONPATH) +ENV PYTHONPATH="/app:${PYTHONPATH}" -# Set environment variables -ENV PYTHONUNBUFFERED=1 - -# Default command to run the Prefect worker -CMD ["prefect", "worker", "start", "--pool", "my-cloud-run-pool", "--type", "cloud-run"] - -# Expose the port that Cloud Run expects -EXPOSE 8080 - -# Health check endpoint for Cloud Run -HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \ - CMD curl -f http://localhost:8080/ || exit 1 \ No newline at end of file +# (Optional) Set a default command (Prefect Cloud will override this when submitting the flow run) +# By default, do nothing or use a generic command. Prefect Cloud's work pool will specify the entrypoint at runtime. +CMD ["python", "-c", "print('Container built for Prefect flow execution')"] diff --git a/prefectWorkflows/prefect.yaml b/prefectWorkflows/prefect.yaml index a6117a8..70e2399 100644 --- a/prefectWorkflows/prefect.yaml +++ b/prefectWorkflows/prefect.yaml @@ -1,44 +1,20 @@ -# Welcome to your prefect.yaml file! You can use this file for storing and managing -# configuration for deploying your flows. We recommend committing this file to source -# control along with your flow code. +# Prefect deployment configuration for the scraper_flow +name: scraper-flow-project # Name of the project (can be any identifier for your reference) +prefect-version: 3.1.10 # Prefect version to use for this deployment (match your Prefect 3.x version) -# Generic metadata about this project -name: prefectWorkflows -prefect-version: 3.2.15 - -# build section allows you to manage and build docker images -# You can leave this empty if not using Docker for deployment -build: {} - -# push section allows you to manage if and how this project is uploaded to remote locations -# Leave this empty if you don't need to push artifacts -push: {} - -# pull section allows you to provide instructions for cloning this project in remote locations -# Remove or uncomment and modify if you need this step for pulling code from GitHub or other places -pull: - - prefect.deployments.steps.git_clone: - repository: https://github.com/Nikhil-Kudupudi/NUBot.git - branch: docker-deployment - access_token: - -# Deployments section allows you to provide configuration for deploying flows deployments: - - name: "scraperflow-deployment" - version: "1.0.0" # Define a version for your deployment - tags: - - "scraper" - - "production" # Add relevant tags - description: "Deployment for ScraperFlow, scheduled weekly" + - name: scraperflow-deployment # Name of this deployment (appears in Prefect UI) + description: "Scrapes all URLs and segments data every Saturday at 9:00 UTC" + entrypoint: scraper_flow.py:scraperflow # Entry point to the flow: "