Add wrapper tool for FactReasoner #79
Status: Open

jbrry wants to merge 42 commits into develop from feature/vector_database_tool.

Commits (42, all by jbrry):
- `727d5f3` Add `VectorDBTool` based on Chroma
- `1c75c1a` Merge branch 'develop' into feature/vector_database_tool
- `d85bf70` Adopt config strategy and add text splitter and vector db tools
- `63e5cbe` Update litsearch agent with new indexing capabilities
- `f1f6a76` Remove comment line
- `2e78325` Changes to standalone script for vectordb
- `2586621` Add test to run `SemanticScholarSearchTool`
- `21727e0` Merge changes from develop
- `25ccec3` Add test for text splitter tool
- `8d3e4b0` Merge branch 'develop' into feature/add_semantic_scholar_search_test
- `384c9d3` Test the `from_params` method of the class
- `231e6ff` Add tests for vector db and text splitter tools
- `c4a57c9` Fix chunking test
- `d947cb7` Leave config initialisation to super class
- `2220443` Add FactReasoner fact-check to deep research pipeline
- `7a2c13d` Add test for FactCheck tool
- `a492e6b` Merge branch 'feature/add_semantic_scholar_search_test' into feature/…
- `595cb38` Updated test file
- `8d9ed47` Wrap string in HttpUrl
- `fce0cfb` Update demo script with fact check
- `6c50cb7` Enable polling for long running processes
- `84cecff` Do not use text splitter tool
- `ae2f2a1` Remove text splitter tool
- `024005c` Get URL paramater from .env file
- `4efd005` Take parameters from env file
- `5b39e84` Make the URL an `HttpUrl`
- `e3f79c9` Add dependencies to pyproject.toml
- `e2988d0` Use updated pyproject.toml file
- `66e6d88` Fix merge conflict in pyproject.toml file
- `32b7dac` Update test for fact-check to correspond to new post and get approach
- `dadb338` Update pyproject, and get API and job timeouts from env
- `14d4be3` Set vector db path from util function
- `1d2627a` Remove dependency on Langchain Documents
- `ccc37f7` Update test for vector db tool
- `f7964b9` Merge from `develop` and fix conflict in pyproject.toml
- `0b81277` Update fact-check tool config to use endpoints given in .env file
- `8582873` Merge branch 'develop' into feature/vector_database_tool
- `03f0f7f` Fix conflict from develop
- `e7ef733` Fix conflict from develop
- `8dbdc07` Sync version of pyproject.toml from integration branch
- `2a50d76` Change vector db collection name to something more generic
- `39c8b8e` Merge branch 'develop' into feature/vector_database_tool
New file (+178 lines): the `FactCheckTool`, a wrapper around the remote FactReasoner service.

```python
import asyncio
import os
from typing import Any, Dict, List, Optional

import httpx
from loguru import logger
from pydantic import Field, HttpUrl

from akd._base import InputSchema, OutputSchema
from akd.tools._base import BaseTool, BaseToolConfig


class FactCheckInputSchema(InputSchema):
    """Input schema for the Fact-Checking Tool."""

    question: str = Field(..., description="The original question that was asked.")
    answer: str = Field(..., description="The LLM answer to be fact-checked.")


class FactCheckOutputSchema(OutputSchema):
    """Output schema for the Fact-Checking Tool's results."""

    fact_reasoner_score: Dict[str, Any] = Field(
        ...,
        description="The full scoring dictionary from the FactReasoner.",
    )
    supported_atoms: List[Dict[str, Any]] = Field(
        ...,
        description="List of atoms determined to be supported.",
    )
    not_supported_atoms: List[Dict[str, Any]] = Field(
        ...,
        description="List of atoms determined to be not supported.",
    )
    contexts: List[Dict[str, Any]] = Field(
        ...,
        description="List of retrieved contexts used for the check.",
    )
    graph_id: Optional[str] = Field(
        None,
        description="The unique ID for the generated fact graph.",
    )
    logging_metadata: Dict[str, Any] = Field(
        {},
        description="Additional logging metadata from the run.",
    )


class FactCheckToolConfig(BaseToolConfig):
    """Configuration for the FactCheckTool."""

    base_url: HttpUrl = Field(
        default=HttpUrl(os.getenv("FACT_CHECK_API_URL", "http://localhost:8011")),
        description="The base URL of the remote Fact-Checking and Correction Service.",
    )
    start_endpoint: str = Field(
        default=os.getenv("FACT_CHECK_START_ENDPOINT", "/fact-check/start"),
        description="Endpoint to start a new fact-checking job.",
    )
    status_endpoint: str = Field(
        default=os.getenv("FACT_CHECK_STATUS_ENDPOINT", "/fact-check/status/"),
        description="Endpoint to get the status of a job. Must end with a slash.",
    )
    correct_endpoint: str = Field(
        default=os.getenv("FACT_CHECK_CORRECT_ENDPOINT", "/correct/"),
        description="Endpoint for single correction steps.",
    )
    display_graph_endpoint: str = Field(
        default=os.getenv("FACT_CHECK_DISPLAY_GRAPH_ENDPOINT", "/display_graph/"),
        description="Endpoint to display a saved fact graph.",
    )
    graph_json_endpoint: str = Field(
        default=os.getenv("FACT_CHECK_GRAPH_JSON_ENDPOINT", "/graph/json/"),
        description="Endpoint to retrieve graph data as JSON.",
    )
    polling_interval_seconds: int = Field(
        default=120,
        description="How often to poll for job results.",
    )
    job_timeout_seconds: int = Field(
        default=int(os.getenv("FACT_CHECK_JOB_TIMEOUT", "1800")),
        description="Maximum time to wait for the entire job to complete (default 30 minutes).",
    )
    request_timeout_seconds: int = Field(
        default=int(os.getenv("FACT_CHECK_REQUEST_TIMEOUT", "60")),
        description="Timeout in seconds for each individual API request.",
    )


class FactCheckTool(
    BaseTool[FactCheckInputSchema, FactCheckOutputSchema],
):
    """A tool that calls an API to perform fact-checking on a given answer."""

    name = "fact_check_tool"
    description = (
        "Calls an API to run the FactReasoner pipeline on a question and answer."
    )
    input_schema = FactCheckInputSchema
    output_schema = FactCheckOutputSchema
    config_schema = FactCheckToolConfig

    def __init__(
        self,
        config: FactCheckToolConfig | None = None,
        debug: bool = False,
    ):
        """Initializes the FactCheckTool and its HTTP client."""
        config = config or FactCheckToolConfig()
        super().__init__(config, debug)

        logger.info("Initializing FactCheckTool...")
        # Set a timeout on the API requests.
        timeout = httpx.Timeout(self.config.request_timeout_seconds, connect=60.0)
        self.api_client = httpx.AsyncClient(
            base_url=str(self.config.base_url),
            timeout=timeout,
        )

    async def _arun(
        self,
        params: FactCheckInputSchema,
    ) -> FactCheckOutputSchema:
        """Starts a fact-checking job and polls for its completion."""
        logger.info(
            f"Sending fact-check request for question: '{params.question[:50]}...'",
        )

        try:
            # Start the job.
            start_response = await self.api_client.post(
                self.config.start_endpoint,
                json=params.model_dump(),
            )
            start_response.raise_for_status()
            job_id = start_response.json()["job_id"]
            logger.info(f"Successfully started job with ID: {job_id}")

            # Poll for the result. The status endpoint already ends with a
            # slash, so the job ID is appended directly (avoids a double slash).
            total_wait_time = 0
            while total_wait_time < self.config.job_timeout_seconds:
                logger.info(f"Polling status for job {job_id}...")
                status_response = await self.api_client.get(
                    f"{self.config.status_endpoint}{job_id}",
                )
                status_response.raise_for_status()
                status_data = status_response.json()

                if status_data["status"] == "completed":
                    logger.info(f"Job {job_id} completed successfully.")
                    return FactCheckOutputSchema(**status_data["result"])
                elif status_data["status"] == "failed":
                    raise Exception(
                        f"Job {job_id} failed on the server: {status_data.get('error', 'Unknown error')}",
                    )
                elif status_data["status"] == "pending":
                    logger.info(
                        f"Job {job_id} is in progress... (waited {total_wait_time}s)",
                    )
                    await asyncio.sleep(self.config.polling_interval_seconds)
                    total_wait_time += self.config.polling_interval_seconds

            raise asyncio.TimeoutError(
                f"Job {job_id} did not complete within the {self.config.job_timeout_seconds}s timeout.",
            )

        except httpx.HTTPStatusError as e:
            logger.error(
                f"HTTP error occurred while calling fact-check API: {e.response.status_code} - {e.response.text}",
            )
            raise
        except Exception as e:
            logger.error(f"An unexpected error occurred: {e}")
            raise
```
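Below is a minimal usage sketch for `FactCheckTool`. It assumes the FactReasoner service is reachable at the configured `base_url`, and that `BaseTool` exposes an async `arun` entry point that delegates to `_arun` (not shown in this diff); the module path in the import is hypothetical, since the diff does not show file names.

```python
import asyncio

# Hypothetical module path; adjust to wherever FactCheckTool lives in akd.
from akd.tools.fact_check import (
    FactCheckInputSchema,
    FactCheckTool,
    FactCheckToolConfig,
)


async def main() -> None:
    # Poll every 10 seconds instead of the default 120 for a quick demo.
    config = FactCheckToolConfig(polling_interval_seconds=10)
    tool = FactCheckTool(config=config)
    result = await tool.arun(
        FactCheckInputSchema(
            question="When was the Hubble Space Telescope launched?",
            answer="The Hubble Space Telescope was launched in 1990.",
        ),
    )
    print(result.fact_reasoner_score)
    print(
        f"{len(result.supported_atoms)} supported, "
        f"{len(result.not_supported_atoms)} not supported",
    )


if __name__ == "__main__":
    asyncio.run(main())
```

Because `_arun` only sleeps between polls, several fact-check jobs could be awaited concurrently (e.g. via `asyncio.gather`) over the same client.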
New file (+146 lines): the `VectorDBTool` based on Chroma.

```python
import os
from typing import Any, Dict, List, Optional

import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from loguru import logger
from pydantic import Field

from akd._base import InputSchema, OutputSchema
from akd.tools._base import BaseTool, BaseToolConfig
from akd.utils import get_akd_root


class VectorDBIndexInputSchema(InputSchema):
    """Input schema for indexing documents into the Vector Database."""

    ids: List[str] = Field(..., description="A unique list of document IDs.")
    documents: List[str] = Field(
        ...,
        description="A list of document contents to index.",
    )
    metadatas: Optional[List[Dict[str, Any]]] = Field(
        None,
        description="Optional list of metadata for each document.",
    )


class VectorDBQueryInputSchema(InputSchema):
    """Input schema for querying documents from the Vector Database."""

    query: str = Field(..., description="The query string for retrieval.")
    k: int = Field(3, description="Number of documents to retrieve.")


class VectorDBQueryOutputSchema(OutputSchema):
    """Output schema for the Vector Database tool's query results."""

    results: List[Dict[str, Any]] = Field(
        ...,
        description="List of retrieved documents, each as a dictionary with 'page_content' and 'metadata'.",
    )


class VectorDBToolConfig(BaseToolConfig):
    """Configuration for the VectorDBTool, loaded from environment variables."""

    embedding_model_name: str = Field(
        default="sentence-transformers/all-MiniLM-L6-v2",
        description="The name of the Hugging Face embedding model to use.",
    )
    embedding_model_api_key: Optional[str] = Field(
        default=os.getenv("EMBEDDING_MODEL_API_KEY", None),
        description="The API key for the embedding model provider, if required.",
    )
    db_path: str = Field(
        default=os.getenv("VECTOR_DB_PATH", str(get_akd_root() / "chroma_db")),
        description="Path to the persistent ChromaDB directory.",
    )
    collection_name: str = Field(
        default="akd_vdb",
        description="Name of the collection within ChromaDB.",
    )


class VectorDBTool(
    BaseTool[VectorDBQueryInputSchema, VectorDBQueryOutputSchema],
):
    """A tool for indexing and retrieving documents from a Chroma vector database."""

    name = "vector_db_tool"
    description = (
        "Indexes documents into a vector database and retrieves them based on a query."
    )
    input_schema = VectorDBQueryInputSchema
    output_schema = VectorDBQueryOutputSchema
    config_schema = VectorDBToolConfig

    def __init__(
        self,
        config: Optional[VectorDBToolConfig] = None,
        debug: bool = False,
    ):
        """Initializes the VectorDBTool and its ChromaDB client."""
        config = config or VectorDBToolConfig()
        super().__init__(config, debug)

        logger.info("Initializing VectorDBTool...")
        self.client = chromadb.PersistentClient(path=self.config.db_path)

        embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=self.config.embedding_model_name,
        )
        self.collection = self.client.get_or_create_collection(
            name=self.config.collection_name,
            embedding_function=embedding_function,
        )
        logger.info(
            f"Connected to ChromaDB collection '{self.config.collection_name}'.",
        )

    def index(self, params: VectorDBIndexInputSchema):
        """Adds documents to the vector database collection."""
        logger.info(f"Indexing {len(params.documents)} documents...")
        self.collection.add(
            ids=params.ids,
            documents=params.documents,
            metadatas=params.metadatas,
        )
        logger.info("Indexing complete.")

    async def _arun(
        self,
        params: VectorDBQueryInputSchema,
    ) -> VectorDBQueryOutputSchema:
        """Retrieves documents and returns them as a list of dictionaries."""
        logger.info(
            f"Querying collection with query: '{params.query}', retrieving top-{params.k} documents",
        )

        results = self.collection.query(
            query_texts=[params.query],
            n_results=params.k,
            include=["metadatas", "documents"],
        )

        # Chroma returns one result list per query text; take the first
        # (and only) entry for the single query issued above.
        retrieved_docs = []
        if results and results.get("ids") and results["ids"][0]:
            result_documents = results["documents"][0]
            result_metadatas = results["metadatas"][0]

            for i in range(len(result_documents)):
                doc = {
                    "page_content": result_documents[i],
                    "metadata": result_metadatas[i]
                    if result_metadatas and result_metadatas[i]
                    else {},
                }
                retrieved_docs.append(doc)

        return VectorDBQueryOutputSchema(results=retrieved_docs)
```
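And a similar sketch for `VectorDBTool`: index two short documents, then query them back. The module path is again hypothetical, and the same `arun` wrapper around `_arun` is assumed; note that `index` is synchronous in this diff, so it is called directly.

```python
import asyncio

# Hypothetical module path; adjust to wherever VectorDBTool lives in akd.
from akd.tools.vector_db import (
    VectorDBIndexInputSchema,
    VectorDBQueryInputSchema,
    VectorDBTool,
)


async def main() -> None:
    tool = VectorDBTool()  # uses VECTOR_DB_PATH or <akd root>/chroma_db

    # Index a couple of toy documents with metadata.
    tool.index(
        VectorDBIndexInputSchema(
            ids=["doc-1", "doc-2"],
            documents=[
                "Chroma is an open-source embedding database.",
                "Sentence-transformers produce dense text embeddings.",
            ],
            metadatas=[{"source": "notes"}, {"source": "notes"}],
        ),
    )

    # Retrieve the single closest document for a query.
    output = await tool.arun(
        VectorDBQueryInputSchema(query="embedding database", k=1),
    )
    for doc in output.results:
        print(doc["page_content"], doc["metadata"])


if __name__ == "__main__":
    asyncio.run(main())
```

Since the ChromaDB client is persistent, re-running this with the same IDs may warn or fail on duplicates, depending on the Chroma version; `collection.upsert` would be the idempotent alternative.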