diff --git a/.github/workflows/backend-docker-image-build.yml b/.github/workflows/backend-docker-image-build.yml index f26e491..54e8976 100644 --- a/.github/workflows/backend-docker-image-build.yml +++ b/.github/workflows/backend-docker-image-build.yml @@ -4,7 +4,8 @@ on: push: branches: - "**" - + paths: + - 'services/backend/**' jobs: backend_build: runs-on: ubuntu-latest @@ -31,3 +32,14 @@ jobs: docker build -t $IMAGE . docker push $IMAGE cd ../.. + + - name: Deploy to Cloud Run + run: | + gcloud run deploy backend-service \ + --source services/backend \ + --region ${{ secrets.GCP_REGION }} \ + --platform managed \ + --allow-unauthenticated \ + --memory 4Gi \ + --timeout 3600s \ + --set-env-vars "AIRFLOW_UID=5000,BASE_URL=https://www.khoury.northeastern.edu/,MAX_DEPTH=3,CONCURRENT_REQUESTS=10,DATA_FOLDER=scraped_data,MISTRAL_API_KEY=${{ secrets.MISTRAL_API_KEY }},MLFLOW_TRACKING_URI=${{ secrets.MLFLOW_TRACKING_URI }},BUCKET_NAME=${{ secrets.BUCKET_NAME }},RAW_DATA_FOLDER=raw_data,FAISS_INDEX_FOLDER=faiss_index,URLS_LIST=https://www.khoury.northeastern.edu/" diff --git a/.github/workflows/frontend-docker-image-build.yml b/.github/workflows/frontend-docker-image-build.yml index 537cd3c..abd2475 100644 --- a/.github/workflows/frontend-docker-image-build.yml +++ b/.github/workflows/frontend-docker-image-build.yml @@ -4,6 +4,8 @@ on: push: branches: - "**" + paths: + - 'services/frontend/**' jobs: frontend_build: @@ -31,3 +33,14 @@ jobs: docker build -t $IMAGE . docker push $IMAGE cd ../.. 
+ + - name: Deploy to Cloud Run + run: | + gcloud run deploy frontend-service \ + --source services/frontend \ + --region ${{ secrets.GCP_REGION }} \ + --platform managed \ + --allow-unauthenticated \ + --memory 1Gi \ + --timeout 1800s \ + --set-env-vars "API_URL=https://backend-service-273412-default.run.app/NuBot/" diff --git a/.github/workflows/frontend1-docker-image-build.yml b/.github/workflows/frontend1-docker-image-build.yml new file mode 100644 index 0000000..148f7ab --- /dev/null +++ b/.github/workflows/frontend1-docker-image-build.yml @@ -0,0 +1,45 @@ +name: "build_reactfrontend_image" + +on: + push: + branches: + - "**" + paths: + - "services/frontend1/**" +jobs: + frontend1_build: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: GCP Authentication + uses: google-github-actions/auth@v2 + with: + credentials_json: "${{ secrets.GCP_KEY }}" + + - name: Setup gcloud SDK + uses: google-github-actions/setup-gcloud@v2 + + - name: Docker login for Artifact Registry + run: | + gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev + + - name: Build and Push React Frontend Image + run: | + cd services/frontend1 + IMAGE=${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/react-service:latest + docker build -t $IMAGE . + docker push $IMAGE + cd ../.. 
+ + - name: Deploy to Cloud Run + run: | + gcloud run deploy react-service \ + --source services/frontend1 \ + --region ${{ secrets.GCP_REGION }} \ + --platform managed \ + --allow-unauthenticated \ + --memory 1Gi \ + --timeout 3600s \ + --set-env-vars "REACT_APP_API_URL=${{secrets.REACT_APP_API_URL}}" diff --git a/.github/workflows/github-orchestration.yml b/.github/workflows/github-orchestration.yml new file mode 100644 index 0000000..e69de29 diff --git a/.github/workflows/prefect_orchestraiton.yml b/.github/workflows/prefect_orchestraiton.yml new file mode 100644 index 0000000..36116f7 --- /dev/null +++ b/.github/workflows/prefect_orchestraiton.yml @@ -0,0 +1,59 @@ +name: Deploy Prefect Flow to Cloud Run + +on: + push: + branches: ["**"] # Triggers on a push to any branch; narrow this (e.g. to main) if deploys should be restricted + +jobs: + build-and-deploy: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + # Authenticate to Google Cloud using the service account JSON key + - name: GCP Authentication + uses: google-github-actions/auth@v2 + with: + credentials_json: "${{ secrets.GCP_KEY }}" + + - name: Setup gcloud SDK + uses: google-github-actions/setup-gcloud@v2 + + - name: Docker login for Artifact Registry + run: | + gcloud auth configure-docker ${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev + + - name: Build Docker image + run: | + cd prefectWorkflows + IMAGE_URI=${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/prefect-scraper:latest + echo "Building image $IMAGE_URI" + docker build -t "$IMAGE_URI" . + # Note: The build context is prefectWorkflows/ (the directory entered above), so the Dockerfile must live there. + + - name: Push Docker image to Artifact Registry + run: | + IMAGE_URI=${{ secrets.GCP_ARTIFACT_REGISTRY_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/backend-nubot/prefect-scraper:latest + docker push "$IMAGE_URI" + # After this step, the image is available in Artifact Registry for Cloud Run to use. 
+ + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Install Prefect3 + run: pip install --no-cache-dir "prefect>=3.2.4" + + - name: Deploy via prefect deploy + run: | + cd prefectWorkflows + prefect deploy -n scraper-cron-deployment # tell pool to use latest image + + - name: Deploy Prefect flow + run: | + cd prefectWorkflows # navigate to the folder containing prefect.yaml + prefect deploy -n scraperflow-deployment + # The -n flag ensures we deploy the specific deployment by name (optional if only one deployment in YAML). + # This command reads prefect.yaml and registers/updates the deployment in Prefect Cloud. diff --git a/.gitignore b/.gitignore index 71c9eff..166527d 100644 --- a/.gitignore +++ b/.gitignore @@ -177,6 +177,7 @@ mlflow-artifacts/ # PyPI configuration file .pypirc -*.html + *.json +!package.json diff --git a/airflow/dags/dataflow/chunk_data.py b/airflow/dags/dataflow/chunk_data.py index fc730d4..30f3ab4 100644 --- a/airflow/dags/dataflow/chunk_data.py +++ b/airflow/dags/dataflow/chunk_data.py @@ -11,7 +11,8 @@ from dataflow.store_data import upload_faiss_index_to_bucket load_dotenv(override=True) BUCKET_NAME= os.getenv('BUCKET_NAME') -GOOGLE_APPLICATION_CREDENTIALS=os.getenv('GOOGLE_APPLICATION_CREDENTIALS') +from google.auth import default +credentials, project = default() RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER') def chunk_data(): # Load all JSON files from a directory diff --git a/airflow/dags/dataflow/scraper.py b/airflow/dags/dataflow/scraper.py index 9a0e5f9..d426ece 100644 --- a/airflow/dags/dataflow/scraper.py +++ b/airflow/dags/dataflow/scraper.py @@ -13,7 +13,8 @@ BASE_URL = os.getenv('BASE_URL') MAX_DEPTH = int(os.getenv('MAX_DEPTH')) # Maximum recursion depth (base URL is depth 0) CONCURRENT_REQUESTS = int(os.getenv('CONCURRENT_REQUESTS')) # Maximum number of concurrent requests -GOOGLE_APPLICATION_CREDENTIALS =os.getenv('GOOGLE_APPLICATION_CREDENTIALS ') +from google.auth import 
default +credentials, project = default() # Create folder for JSON data DATA_FOLDER = "scraped_data" if not os.path.exists(DATA_FOLDER): diff --git a/airflow/dags/dataflow/store_data.py b/airflow/dags/dataflow/store_data.py index d142be3..95ffa68 100644 --- a/airflow/dags/dataflow/store_data.py +++ b/airflow/dags/dataflow/store_data.py @@ -5,7 +5,8 @@ BUCKET_NAME= os.getenv('BUCKET_NAME') RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER') FAISS_INDEX_FOLDER= os.getenv('FAISS_INDEX_FOLDER') -GOOGLE_APPLICATION_CREDENTIALS=os.getenv('GOOGLE_APPLICATION_CREDENTIALS') +from google.auth import default +credentials, project = default() def get_blob_from_bucket(): storage_client = Client() diff --git a/prefectWorkflows/.dockerignore b/prefectWorkflows/.dockerignore new file mode 100644 index 0000000..3168a00 --- /dev/null +++ b/prefectWorkflows/.dockerignore @@ -0,0 +1,3 @@ +*.env +scraped_data/ +faiss_index/ \ No newline at end of file diff --git a/prefectWorkflows/.env b/prefectWorkflows/.env index 29ab14c..32d03ba 100644 --- a/prefectWorkflows/.env +++ b/prefectWorkflows/.env @@ -6,4 +6,6 @@ DATA_FOLDER = "scraped_data" BUCKET_NAME=scraped_raw_data_nubot RAW_DATA_FOLDER=raw_data FAISS_INDEX_FOLDER=faiss_index -GOOGLE_APPLICATION_CREDENTIALS="E:/gcpkeys/nubot/nubot-nikhil-6adeee091d55.json" \ No newline at end of file +GOOGLE_APPLICATION_CREDENTIALS="E:/gcpkeys/nubot/nubot-nikhil-6adeee091d55.json" +# PREFECT_API_KEY must be supplied via the environment or a secret store; never commit its value to this file +PREFECT_API_URL="https://api.prefect.cloud/api/accounts/806f2e07-5063-4fbe-9b46-0545ad5de2d1/workspaces/acdf9e9e-8a55-446a-ac46-80a3f843d8b6" diff --git a/prefectWorkflows/Dockerfile b/prefectWorkflows/Dockerfile new file mode 100644 index 0000000..dfc7b42 --- /dev/null +++ b/prefectWorkflows/Dockerfile @@ -0,0 +1,25 @@ +# Start from a lightweight Python image (use the appropriate Python version) +FROM python:3.10-slim + +# Set working directory in container +WORKDIR /app + +# Install Python dependencies. 
+# If you have a requirements.txt, copy and install it: +COPY requirements.txt . + +RUN pip install --no-cache-dir -r requirements.txt + +# (Alternatively, directly install Prefect and any needed libraries) +# RUN pip install prefect==3.1.10 + +# Copy the Prefect flow code and the dataflow module into the image +COPY . . + + +# Ensure Python can find the 'dataflow' module (add /app to PYTHONPATH) +ENV PYTHONPATH="/app:${PYTHONPATH}" + +# (Optional) Set a default command (Prefect Cloud will override this when submitting the flow run) +# By default, do nothing or use a generic command. Prefect Cloud's work pool will specify the entrypoint at runtime. +CMD ["python", "-c", "print('Container built for Prefect flow execution')"] diff --git a/prefectWorkflows/dataflow/chunk_data.py b/prefectWorkflows/dataflow/chunk_data.py index 643dcba..fc35e37 100644 --- a/prefectWorkflows/dataflow/chunk_data.py +++ b/prefectWorkflows/dataflow/chunk_data.py @@ -11,7 +11,20 @@ from dataflow.store_data import upload_faiss_index_to_bucket load_dotenv(override=True) BUCKET_NAME= os.getenv('BUCKET_NAME') -GOOGLE_APPLICATION_CREDENTIALS=os.getenv('GOOGLE_APPLICATION_CREDENTIALS') +from google.auth import default +from google.oauth2 import service_account + +# Try to get credentials - works in both Docker and Cloud Run +try: + # First try Application Default Credentials (works in Cloud Run) + credentials, project = default() +except Exception: + # Fall back to explicit credentials file (for Docker) + credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") + if credentials_path: + credentials = service_account.Credentials.from_service_account_file(credentials_path) + else: + raise Exception("No credentials available") RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER') def chunk_data(): # Load all JSON files from a directory @@ -52,4 +65,4 @@ def chunk_data(): if __name__=="__main__": chunk_data() - + upload_faiss_index_to_bucket() \ No newline at end of file diff --git 
a/prefectWorkflows/dataflow/rag_model.py b/prefectWorkflows/dataflow/rag_model.py deleted file mode 100644 index ffcfa6c..0000000 --- a/prefectWorkflows/dataflow/rag_model.py +++ /dev/null @@ -1,149 +0,0 @@ -from functools import lru_cache -from langchain import hub -from langchain_core.documents import Document -from langgraph.graph import START, StateGraph -from typing_extensions import List, TypedDict -from langchain_community.embeddings import HuggingFaceEmbeddings -from langchain.chat_models import init_chat_model -from langchain_community.vectorstores import FAISS -import getpass -import os -from dotenv import load_dotenv -import mlflow -import time -from langfair.auto import AutoEval -import asyncio -# Load the FAISS index -from google.cloud.storage import Client -import tempfile -import os -load_dotenv(override=True) -mlflow.langchain.autolog() -MLFLOW_TRACKING_URI =os.environ.get("MLFLOW_TRACKING_URI") -MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY") -FAISS_INDEX_FOLDER= os.getenv('FAISS_INDEX_FOLDER') -mlflow.set_tracking_uri(MLFLOW_TRACKING_URI) # Remote MLflow Server -mlflow.set_experiment("rag_experiment") -if not os.environ.get("MISTRAL_API_KEY"): - os.environ["MISTRAL_API_KEY"] = getpass.getpass("Enter API key for Mistral AI: ") - -@lru_cache(maxsize=None) -def get_llm(): - llm = init_chat_model("mistral-large-latest", model_provider="mistralai") - return llm - -@lru_cache(maxsize=None) -def get_prompt(): -# Define prompt for question-answering - prompt = hub.pull("rlm/rag-prompt") - return prompt - - -# Define state for application -class State(TypedDict): - question: str - context: List[Document] - answer: str - - -@lru_cache(maxsize=None) -def load_embeddings(): - embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") - return embeddings - - - -# Initialize GCS client -storage_client = Client() -bucket=storage_client.bucket(os.getenv('BUCKET_NAME')) -embeddings=load_embeddings() -if not 
os.path.exists(FAISS_INDEX_FOLDER): - os.makedirs(FAISS_INDEX_FOLDER, exist_ok=True) -# Create a temporary directory -# Download FAISS index files from bucket to FAISS_INDEX_FOLDER directory -for blob in bucket.list_blobs(prefix=FAISS_INDEX_FOLDER): - # Extract just the filename from the full path - filename = os.path.basename(blob.name) - local_path = os.path.join(FAISS_INDEX_FOLDER, filename) - blob.download_to_filename(local_path) - -# Load FAISS index from directory -vector_store = FAISS.load_local(FAISS_INDEX_FOLDER, embeddings, allow_dangerous_deserialization=True) -# Define application steps -def retrieve(state: State): - with mlflow.start_run(nested=True, run_name="retrieval"): - start_time = time.time() - retrieved_docs = vector_store.similarity_search(state["question"]) - retrieval_time = time.time() - start_time - - # Extract only metadata - doc_metadata = [{"doc_id": doc.metadata.get("id", i), "source": doc.metadata.get("source", "unknown")} - for i, doc in enumerate(retrieved_docs)] - - # Log metadata instead of full documents - mlflow.log_metric("retrieval_time", retrieval_time) - mlflow.log_param("retrieved_docs_count", len(retrieved_docs)) - mlflow.log_dict(doc_metadata, "retrieved_docs.json") - - return {"context": retrieved_docs} - -# Initialize LLM once and store in a global variable -llm = get_llm() -# Initialize prompt once and store in a global variable -prompt = get_prompt() -def generate(state: State): - with mlflow.start_run(nested=True, run_name="generation"): - start_time = time.time() - docs_content = "\n\n".join(doc.page_content for doc in state["context"]) - token_count = len(docs_content.split()) - # Use the global prompt instance - mlflow.log_param("retrieved_tokens", token_count) - mlflow.log_param("context_length", len(docs_content)) - messages = prompt.invoke({"question": state["question"], "context": docs_content}) - response = llm.invoke(messages) - generation_time = time.time() - start_time - - # Log LLM generation performance 
- mlflow.log_metric("generation_time", generation_time) - mlflow.log_param("response_length", len(response.content.split())) - mlflow.log_param("model_name", "mistral-large-latest") - - # Save response - # with open("response.txt", "w") as f: - # f.write(response.content) - # mlflow.log_artifact("response.txt") - - return {"answer": response.content} - - -def generateResponse(query): -# Compile application and test - try: - with mlflow.start_run(run_name="RAG_Pipeline"): - mlflow.log_param("query", query) - graph_builder = StateGraph(State).add_sequence([retrieve, generate]) - graph_builder.add_edge(START, "retrieve") - graph = graph_builder.compile() - response = graph.invoke({"question": f"{query}"}) - mlflow.log_param("final_answer", response["answer"]) - return response["answer"] - except Exception as e: - mlflow.log_param("error", str(e)) - raise Exception(e) - -async def checkModel_fairness(): - auto_object = AutoEval( - prompts=["tell me about khoury"], - langchain_llm=llm, - # toxicity_device=device # uncomment if GPU is available - ) - results = await auto_object.evaluate() - print(results['metrics']) - -if __name__ == "__main__": - - query=input("generate query") - response=generateResponse(query) - print(response) - #uncomment and enter prompts for model fairness and there is a limitation on api key - # asyncio.run(checkModel_fairness()) \ No newline at end of file diff --git a/prefectWorkflows/dataflow/scraper.py b/prefectWorkflows/dataflow/scraper.py index 9a0e5f9..4676a4a 100644 --- a/prefectWorkflows/dataflow/scraper.py +++ b/prefectWorkflows/dataflow/scraper.py @@ -6,14 +6,30 @@ import re from urllib.parse import urljoin, urlparse from dotenv import load_dotenv - +import hashlib from dataflow.store_data import upload_many_blobs_with_transfer_manager load_dotenv(override=True) # Configuration -BASE_URL = os.getenv('BASE_URL') +URLS_LIST=list(os.getenv('URLS_LIST','').split(",")) + +# BASE_URL ="" #URLS_LIST[0]#os.getenv('BASE_URL') MAX_DEPTH = 
int(os.getenv('MAX_DEPTH')) # Maximum recursion depth (base URL is depth 0) CONCURRENT_REQUESTS = int(os.getenv('CONCURRENT_REQUESTS')) # Maximum number of concurrent requests -GOOGLE_APPLICATION_CREDENTIALS =os.getenv('GOOGLE_APPLICATION_CREDENTIALS ') + +from google.auth import default +from google.oauth2 import service_account + +# Try to get credentials - works in both Docker and Cloud Run +try: + # First try Application Default Credentials (works in Cloud Run) + credentials, project = default() +except Exception: + # Fall back to explicit credentials file (for Docker) + credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") + if credentials_path: + credentials = service_account.Credentials.from_service_account_file(credentials_path) + else: + raise Exception("No credentials available") # Create folder for JSON data DATA_FOLDER = "scraped_data" if not os.path.exists(DATA_FOLDER): @@ -24,7 +40,8 @@ def safe_filename(url): parsed = urlparse(url) path = parsed.path.strip('/') or 'index' filename = re.sub(r'[^A-Za-z0-9_\-]', '_', path) + ".json" - return os.path.join(DATA_FOLDER, filename) + url_hash = hashlib.md5(url.encode()).hexdigest()[:8] + return os.path.join(DATA_FOLDER, f"{os.path.splitext(filename)[0]}_{url_hash}.json") async def fetch(session, url, semaphore): """Fetch the content of the URL asynchronously.""" @@ -39,7 +56,7 @@ async def fetch(session, url, semaphore): print(f"Error fetching {url}: {e}") return None -async def async_scrape(url, depth=0, session=None, semaphore=None): +async def async_scrape(url,BASE_URL, depth=0, session=None, semaphore=None): """Recursively scrape pages asynchronously and store in JSON format.""" if depth > MAX_DEPTH: return @@ -81,26 +98,33 @@ async def async_scrape(url, depth=0, session=None, semaphore=None): next_url = urljoin(url, link['href']) if urlparse(next_url).netloc == urlparse(BASE_URL).netloc: next_url = next_url.split('#')[0] # Remove fragments - tasks.append(async_scrape(next_url, depth + 1, session, semaphore)) + 
tasks.append(async_scrape(next_url,BASE_URL, depth + 1, session, semaphore)) if tasks: await asyncio.gather(*tasks) -async def scrape_and_load(): +async def scrape_and_load(CURRENT_URl): """Main function to initiate scraping.""" semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS) async with aiohttp.ClientSession() as session: - await async_scrape(BASE_URL, depth=0, session=session, semaphore=semaphore) + await async_scrape(CURRENT_URl,BASE_URL=CURRENT_URl, depth=0, session=session, semaphore=semaphore) def scrape_and_load_task(): - asyncio.run(scrape_and_load()) + for url in URLS_LIST: + BASE_URL=url + asyncio.run(scrape_and_load(BASE_URL)) + print("*"*15) + print(f"scraping {url} done") + print("*"*15) + upload_many_blobs_with_transfer_manager() return if __name__ == '__main__': - asyncio.run(scrape_and_load()) - upload_many_blobs_with_transfer_manager() \ No newline at end of file + scrape_and_load_task() + # asyncio.run(scrape_and_load()) + # upload_many_blobs_with_transfer_manager() \ No newline at end of file diff --git a/prefectWorkflows/dataflow/store_data.py b/prefectWorkflows/dataflow/store_data.py index 039dc22..e89ad85 100644 --- a/prefectWorkflows/dataflow/store_data.py +++ b/prefectWorkflows/dataflow/store_data.py @@ -5,8 +5,20 @@ BUCKET_NAME= os.getenv('BUCKET_NAME') RAW_DATA_FOLDER= os.getenv('RAW_DATA_FOLDER') FAISS_INDEX_FOLDER= os.getenv('FAISS_INDEX_FOLDER') -GOOGLE_APPLICATION_CREDENTIALS=os.getenv('GOOGLE_APPLICATION_CREDENTIALS') +from google.auth import default +from google.oauth2 import service_account +# Try to get credentials - works in both Docker and Cloud Run +try: + # First try Application Default Credentials (works in Cloud Run) + credentials, project = default() +except Exception: + # Fall back to explicit credentials file (for Docker) + credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") + if credentials_path: + credentials = service_account.Credentials.from_service_account_file(credentials_path) + else: + raise 
Exception("No credentials available") def get_blob_from_bucket(): storage_client = Client() bucket = storage_client.bucket(BUCKET_NAME) @@ -19,7 +31,7 @@ def get_blob_from_bucket(): def upload_many_blobs_with_transfer_manager( - + workers=8 ): """Upload every file in a list to a bucket, concurrently in a process pool. diff --git a/prefectWorkflows/prefect.yaml b/prefectWorkflows/prefect.yaml index ab669e5..879a8bd 100644 --- a/prefectWorkflows/prefect.yaml +++ b/prefectWorkflows/prefect.yaml @@ -1,47 +1,20 @@ -# Welcome to your prefect.yaml file! You can use this file for storing and managing -# configuration for deploying your flows. We recommend committing this file to source -# control along with your flow code. +# Prefect deployment configuration for the scraper_flow +name: scraper-flow-project # Name of the project (can be any identifier for your reference) +prefect-version: 3.1.10 # Prefect version to use for this deployment (match your Prefect 3.x version) -# Generic metadata about this project -name: prefectWorkflows -prefect-version: 3.2.15 - -# build section allows you to manage and build docker images -build: - -# push section allows you to manage if and how this project is uploaded to remote locations -push: - -# # pull section allows you to provide instructions for cloning this project in remote locations -# pull: -# - prefect.deployments.steps.git_clone: -# repository: https://github.com/Nikhil-Kudupudi/NUBot.git -# branch: gcs_bucket -# access_token: - -# the deployments section allows you to provide configuration for deploying flows deployments: -- name: - version: - tags: [] - description: - schedule: {} - flow_name: - entrypoint: - parameters: {} - work_pool: - name: - work_queue_name: - job_variables: {} -- name: default - version: - tags: [] - concurrency_limit: - description: - entrypoint: scraper_flow.py:scraperflow - parameters: {} - work_pool: - name: nubot_dataflow - work_queue_name: - job_variables: {} - schedules: [] + - name: 
scraperflow-deployment # Name of this deployment (appears in Prefect UI) + description: "Scrapes all URLs and segments data every Saturday at 9:00 UTC" + entrypoint: scraper_flow.py:scraper_flow # Entry point to the flow: "