fix: added runnables to create usable models from served endpoint

broomva · May 12, 2024 · 08cb77c · 08cb77c
1 parent 0197998
commit 08cb77c
Show file tree

Hide file tree

Showing 8 changed files with 351 additions and 22 deletions.
diff --git a/arcan/ai/llm/__init__.py b/arcan/ai/llm/__init__.py
@@ -3,6 +3,7 @@
 import os
 from typing import Any, Callable, Dict, List, Optional, Union
 
+from langchain_groq import ChatGroq
 from langchain_openai import ChatOpenAI, OpenAI
 from pydantic import BaseModel
 
@@ -84,6 +85,13 @@ class LLMFactory:
                 os.getenv("OPENAI_API_BASE_URL", "https://api.together.xyz/v1"),
             ),
         ),
+        "ChatGroq": lambda **kwargs: ChatGroq(
+            temperature=kwargs.get("temperature", 0.3),
+            model_name=kwargs.get(
+                # Groq uses its own model ids, so the Together provider's variable is not reused.
+                "model", os.getenv("GROQ_MODEL_NAME", "llama3-8b-8192"),
+            ),
+        ),
     }
 
     @staticmethod

diff --git a/arcan/ai/runnables/__init__.py b/arcan/ai/runnables/__init__.py
@@ -0,0 +1,61 @@
+#%%
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.runnables import Runnable
+from langserve import RemoteRunnable
+
+
+class RunnableFactory:
+    """Build RemoteRunnable clients for endpoints under base_url, memoized by endpoint name."""
+    def __init__(self, base_url: str = "http://localhost:8000/"):
+        self.base_url = base_url
+        self.runnable_cache = {}
+
+    def get_runnable(self, runnable_name: str, cache: bool = True) -> Runnable:
+        """Return the client for runnable_name; cache=False builds a fresh, unstored client."""
+        if cache and runnable_name in self.runnable_cache:
+            return self.runnable_cache[runnable_name]
+        # Join on a single "/" so a base_url without the trailing slash still yields a valid URL.
+        runnable = RemoteRunnable(f"{self.base_url.rstrip('/')}/{runnable_name.strip('/')}/")
+        return self.runnable_cache.setdefault(runnable_name, runnable) if cache else runnable
+
+class ArcanRunnables:
+    """Named accessors for the spells-agent, OpenAI and Groq runnables served under base_url."""
+    def __init__(self, base_url: str = "http://localhost:8000/"):
+        self.factory = RunnableFactory(base_url=base_url)
+
+    def get_chat_spells_agent_runnable(self):
+        return self.factory.get_runnable("spells_agent")
+
+    def get_openai_runnable(self):
+        return self.factory.get_runnable("openai")
+
+    def get_groq_runnable(self):
+        return self.factory.get_runnable("groq")
+# %%
+
+
+
+# from langchain.schema import HumanMessage, SystemMessage
+# from langchain.schema.runnable import RunnableMap
+
+# arcan_runnables = ArcanRunnables(base_url="http://localhost:8000/")
+# chat_spells_agent = arcan_runnables.get_chat_spells_agent_runnable()
+# openai_runnable = arcan_runnables.get_openai_runnable()
+# groq_runnable = arcan_runnables.get_groq_runnable()
+
+
+# prompt = ChatPromptTemplate.from_messages(
+#     [("system", "Tell me a long story about {topic}")]
+# )
+
+# # Can define custom chains
+# chain = prompt | RunnableMap({
+#     "openai": openai_runnable,
+#     "groq": groq_runnable,
+# })
+# # %%
+
+# chain.batch([{"topic": "parrots"}, {"topic": "cats"}])
+
+
+# %%
diff --git a/arcan/ai/tools/__init__.py b/arcan/ai/tools/__init__.py
@@ -7,7 +7,8 @@
 from langchain_core.utils.function_calling import convert_to_openai_function
 from langchain_experimental.utilities import PythonREPL
 
-from arcan.spells.scrapping import scrape_website, scrape_website_selenium
+from arcan.spells.scrapping import (firecrawl_scrape, scrape_website,
+                                    scrape_website_selenium)
 from arcan.spells.search import serper_api_search
 
 load_dotenv()
@@ -51,6 +52,12 @@ def get_word_length(word: str) -> int:
     description="Useful when you need to get data from a website url and the regular Scrape Website method is not working correctly; DO NOT make up any url, the url should only be from the search results. Prefer Tavily seach tool over this one unless explicitly asked to perform a scrapping task",
 )
 
+firecrawl_tool = Tool(
+    name="firecrawl",
+    func=firecrawl_scrape,  # imported from arcan.spells.scrapping
+    description="Useful when you need to get data from a website url; DO NOT make up any url, use the one provided by the user.",
+)
+
 python_repl = PythonREPL()
 
 repl_tool = Tool(

diff --git a/arcan/api/__init__.py b/arcan/api/__init__.py
@@ -3,14 +3,15 @@
 
 from dotenv import load_dotenv
 from fastapi import Depends, FastAPI, Form, Request
+from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import RedirectResponse
 from langchain_core.messages import AIMessage, FunctionMessage, HumanMessage
-from langchain_core.runnables import Runnable
 from langserve import add_routes
 from langserve.pydantic_v1 import BaseModel, Field
 from sqlalchemy.orm import Session
 
 from arcan.ai.agents import ArcanSpellsAgent
+from arcan.ai.llm import LLM
 from arcan.api.datamodels import get_db, get_db_context
 from arcan.api.session import ArcanSession, run_agent
 
@@ -26,11 +27,17 @@
 app = FastAPI()
 
 
-# @app.get("/")
-# def default():
-#     return {
-#         "message": "Check out the API documentation at http://arcanai.tech/api/docs"
-#     }
+# Allow cross-origin calls from any site, with every method and header permitted.
+# NOTE(review): wildcard origins plus allow_credentials=True is a risky pairing - confirm intended.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+    expose_headers=["*"],
+)
+
 
 @app.get("/")
 async def redirect_root_to_docs():
@@ -42,13 +49,6 @@ async def index():
     return {"message": "Arcan is Running!"}
 
 
-# @app.get("/api/chat/{user_id}")
-# async def api_user_chat(user_id: str, query: str, db: Session = Depends(get_db)):
-#     arcan_session = ArcanSession(db)
-#     response = run_agent(session=arcan_session, user_id=user_id, query=query)
-#     return {"response": response}
-
-
 # @requires_auth
 @app.get("/api/chat")
 async def chat(user_id: str, query: str, db: Session = Depends(get_db)):
@@ -58,7 +58,6 @@ async def chat(user_id: str, query: str, db: Session = Depends(get_db)):
 
 #%%
 
-
 class Input(BaseModel):
     input: str
     chat_history: List[Union[HumanMessage, AIMessage, FunctionMessage]] = Field(
@@ -70,13 +69,27 @@ class Output(BaseModel):
     output: Any
 
 
-def get_runnable() -> Runnable:
-    return ArcanSpellsAgent().agent_executor
-
-
 add_routes(
     app=app,
-    runnable=get_runnable().with_types(input_type=Input, output_type=Output).with_config({"run_name": "agent"}),
+    runnable=ArcanSpellsAgent().agent_executor.with_types(input_type=Input, output_type=Output).with_config({"run_name": "agent"}),
     path="/spells_agent",
     enable_feedback_endpoint=True,
 )
+
+# Serve each raw chat model as its own LangServe route, next to the agent endpoint.
+# Pairs are (provider name passed to LLM, URL path the model is mounted on) and are
+# registered in the order listed.
+_LLM_ROUTES = (
+    ("ChatOpenAI", "/openai"),
+    ("ChatGroq", "/groq"),
+    ("ChatTogetherAI", "/together"),
+)
+
+# NOTE(review): each LLM is built at import time, so a provider whose credentials are
+# missing presumably prevents the app from starting - confirm.
+for _provider, _route in _LLM_ROUTES:
+    add_routes(
+        app,
+        LLM(provider=_provider).llm,
+        path=_route,
+    )
diff --git a/arcan/spells/scrapping.py b/arcan/spells/scrapping.py
@@ -6,6 +6,12 @@
 import html2text
 import requests
 from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+from langchain.agents import Tool
+from langchain_community.tools import WikipediaQueryRun
+from langchain_community.tools.tavily_search import TavilySearchResults
+from langchain_community.utilities import WikipediaAPIWrapper
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 
@@ -128,3 +134,92 @@ def url_text_scrapper(url: str):
         file_path.write_text(scrapped_text)
 
     return scrapped_text, clean_domain
+
+
+def firecrawl_loader(url: str, mode: str = "scrape"):
+    """Build (without running) a FireCrawlLoader for `url`, keyed by FIRECRAWL_API_KEY.
+
+    mode="scrape" scrapes the single url and returns its markdown; mode="crawl" crawls
+    the url and all accessible sub pages and returns the markdown for each one.
+    """
+    from langchain_community.document_loaders import FireCrawlLoader
+
+    return FireCrawlLoader(api_key=os.environ.get("FIRECRAWL_API_KEY"), url=url, mode=mode)
+
+
+
+
+def firecrawl_scrape(url):
+    """Scrape `url` with Firecrawl using its LLM-based extraction mode.
+
+    The request asks Firecrawl to keep only the page's main content and to run
+    the extraction prompt below over it.
+
+    :param url: Address of the page to scrape.
+    :return: Whatever `FirecrawlApp.scrape_url` returns for this request.
+        NOTE(review): the previous docstring promised a markdown string; with
+        'llm-extraction' the payload is presumably structured - confirm.
+    """
+    scrape_params = {
+        'extractorOptions': {
+            'mode': 'llm-extraction',
+            'extractionPrompt': 'Extract the key elements, segment by NER, and summarize the content. Make sure the returned content is at most 16385 tokens',
+        },
+        'pageOptions': {'onlyMainContent': True},
+    }
+    # FirecrawlApp gets no arguments; presumably it reads FIRECRAWL_API_KEY itself - confirm.
+    return FirecrawlApp().scrape_url(url, scrape_params)
+
+
+
+from pydantic import AnyHttpUrl
+
+
+def scrapegraph_scrape(url: AnyHttpUrl, prompt: str):
+    """Run a ScrapeGraphAI SmartScraperGraph over `url`, configured for a local Ollama server.
+
+    :param url: Page to scrape; converted to `str` before it is handed to the graph.
+    :param prompt: Instruction describing what to extract from the page.
+    :return: The value produced by `SmartScraperGraph.run()`. It is still printed;
+        previously the function discarded it and returned None.
+    """
+    from scrapegraphai.graphs import SmartScraperGraph
+
+    ollama_url = "http://localhost:11434"
+    graph_config = {
+        "llm": {
+            "model": "ollama/mistral",
+            "temperature": 0,
+            "format": "json",  # Ollama needs the format to be specified explicitly
+            "base_url": ollama_url,
+        },
+        "embeddings": {"model": "ollama/nomic-embed-text", "base_url": ollama_url},
+        "verbose": True,
+    }
+    # `source` also accepts a string with the already downloaded HTML code.
+    result = SmartScraperGraph(prompt=prompt, source=str(url), config=graph_config).run()
+    print(result)
+    return result
+
+
+from pydantic import FilePath
+
+
+async def llama_parse_scrape(pdf_path: FilePath):
+    """Parse the PDF at `pdf_path` with LlamaParse and return the loaded documents.
+
+    The parser is keyed by LLAMA_CLOUD_API_KEY and asked for English markdown output.
+    """
+    import nest_asyncio
+
+    nest_asyncio.apply()  # applied before llama_parse is imported, as in the original order
+    from llama_parse import LlamaParse
+
+    llama_parser = LlamaParse(
+        api_key=os.environ.get("LLAMA_CLOUD_API_KEY"),
+        result_type="markdown",  # "markdown" and "text" are available
+        num_workers=4,  # if multiple files passed, split in `num_workers` API calls
+        verbose=True,
+        language="en",  # Optionally you can define a language, default=en
+    )
+    return await llama_parser.aload_data(pdf_path)