# web3_graphrag_demo.py

import os
from datetime import datetime
import logging

from nano_graphrag.base import BaseKVStorage
import ollama
import numpy as np
from nano_graphrag import GraphRAG, QueryParam
from nano_graphrag._utils import compute_args_hash, wrap_embedding_func_with_attrs
from pdfToTxt import convert_pdf_to_txt
from web_scraper import WebScraper
from nano_graphrag.prompt import PROMPTS
import argparse
from logger_config import logger_config

# Get loggers
# `llm_logger` (from the project's logger_config) records raw LLM responses;
# `logger` is the standard module-level logger used for everything else.
llm_logger = logger_config.get_llm_logger()
logger = logging.getLogger(__name__)

# Constants
# WORKING_DIR is handed to GraphRAG as its working_dir, MODEL is the Ollama
# chat model name, and the remaining paths are the corpus input/output folders.
WORKING_DIR = "./web3_corpus"
MODEL = "mistral:ctx32k"
PDF_DIR = "./cryptoKGTutorial/rawWhitePapers"
TXT_DIR = "./cryptoKGTutorial/txtWhitePapers"
WEB_CONTENT_DIR = "./cryptoKGTutorial/webContent"

# Create directories if they don't exist
# NOTE: these calls run at import time, so merely importing this module creates
# the folders. PDF_DIR is not created here.
os.makedirs(WORKING_DIR, exist_ok=True)
os.makedirs(TXT_DIR, exist_ok=True)
os.makedirs(WEB_CONTENT_DIR, exist_ok=True)

# Assumed embedding model settings
# NOTE(review): the dimension and max-token values are assumed to match
# EMBEDDING_MODEL -- confirm against the model actually served by Ollama.
EMBEDDING_MODEL = "nomic-embed-text"
EMBEDDING_MODEL_DIM = 768
EMBEDDING_MODEL_MAX_TOKENS = 8192

# Embeddings are generated through Ollama using EMBEDDING_MODEL.
@wrap_embedding_func_with_attrs(
    embedding_dim=EMBEDDING_MODEL_DIM,
    max_token_size=EMBEDDING_MODEL_MAX_TOKENS,
)
async def ollama_embedding(texts: list[str]) -> np.ndarray:
    """Embed each text with the Ollama embedding model.

    Args:
        texts: Strings to embed. One Ollama request is issued per string.

    Returns:
        A NumPy array holding one embedding row per input text, in input order.
    """
    vectors = [
        ollama.embeddings(model=EMBEDDING_MODEL, prompt=text)["embedding"]
        for text in texts
    ]
    # Convert to an ndarray so the return value matches the declared type.
    return np.array(vectors)

# Ollama model function with logging
async def ollama_model_if_cache(
        prompt,
        system_prompt="You are an intelligent assistant and will follow the instructions given to you to fulfill the goal. The answer should be in the format as in the given example.",
        history_messages=None, **kwargs
        ) -> str:
    """Send a chat request to the Ollama model, using the KV cache when given.

    Args:
        prompt: User message for this turn.
        system_prompt: Optional system message; skipped when empty.
        history_messages: Earlier chat messages placed between the system
            message and the prompt. ``None`` means no history.
        **kwargs: Extra options forwarded to ``AsyncClient.chat``.
            ``max_tokens`` and ``response_format`` are dropped, and
            ``hashing_kv`` (a KV store) is consumed here for response caching.

    Returns:
        The model's reply text, taken from the cache when an entry exists.
    """
    # remove kwargs that are not supported by ollama
    kwargs.pop("max_tokens", None)
    kwargs.pop("response_format", None)
    hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    # A None default avoids sharing one mutable list between calls.
    messages.extend(history_messages or [])
    messages.append({"role": "user", "content": prompt})

    # Return the cached response when one exists for this exact conversation.
    args_hash = None
    if hashing_kv is not None:
        args_hash = compute_args_hash(MODEL, messages)
        cached = await hashing_kv.get_by_id(args_hash)
        if cached is not None:
            return cached["return"]

    # The client is only created once we know a real request is needed.
    ollama_client = ollama.AsyncClient()
    response = await ollama_client.chat(model=MODEL, messages=messages, **kwargs)
    result = response["message"]["content"]
    llm_logger.debug(f"LLM Response: {result}")

    # Store the fresh response so the next identical request is a cache hit.
    if hashing_kv is not None:
        await hashing_kv.upsert({args_hash: {"return": result, "model": MODEL}})
    return result

def remove_if_exist(file):
    """Delete *file*, doing nothing when it is already absent.

    Args:
        file: Path of the file to delete.
    """
    try:
        os.remove(file)
    except FileNotFoundError:
        # Attempting the removal directly avoids the check-then-remove race
        # where the file disappears between the existence test and the delete.
        pass

def convert_pdfs():
    """Run the PDF-to-text converter from PDF_DIR into TXT_DIR."""
    convert_pdf_to_txt(PDF_DIR, TXT_DIR)
    # Use the module logger, like every other function in this file, rather
    # than the root logger.
    logger.info("PDF to TXT conversion completed.")

def _insert_text_files(rag, directory, *, web_content=False, exclude_file=None):
    """Insert every ``.txt`` file found directly in *directory* into *rag*.

    Args:
        rag: GraphRAG instance receiving the content.
        directory: Folder scanned (non-recursively) for ``.txt`` files.
        web_content: When True, log messages use the "web content" wording.
        exclude_file: Filename to skip, or None to insert everything.
    """
    banner_prefix = "Web Content: " if web_content else ""
    added_prefix = "web content " if web_content else ""

    # os.listdir order is arbitrary; sorting makes insertion reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        if exclude_file and filename == exclude_file:
            logger.info(f"Skipping excluded file: {filename}")
            continue

        logger.info(f"=== Processing {banner_prefix}{filename} ===")
        with open(os.path.join(directory, filename), "r", encoding="utf-8") as f:
            content = f.read()

        nodes_before, edges_before = get_graph_size(rag)
        logger.info(f"Inserting content from {filename}...")
        rag.insert(content)
        nodes_after, edges_after = get_graph_size(rag)

        # Only report files that actually grew the graph.
        if nodes_after > nodes_before or edges_after > edges_before:
            logger.info(f"Added {added_prefix}{filename} to knowledge base")
            logger.info(f"Graph changes: Nodes {nodes_before}->{nodes_after}, Edges {edges_before}->{edges_after}")


def insert_documents(exclude_file=None):
    """
    Rebuild the GraphRAG store from the text and web-content folders.

    Existing graph, vector and KV store files in WORKING_DIR are deleted
    first, then every ``.txt`` file in TXT_DIR and WEB_CONTENT_DIR is inserted.

    Args:
        exclude_file (str, optional): Filename in TXT_DIR to exclude from
                                    insertion. If None, all files will be
                                    inserted. Web content is never excluded.
    """
    logger.info("=== Starting Document Insertion ===")
    # Remove existing GraphRAG files so the graph is rebuilt from scratch.
    stale_files = (
        "vdb_entities.json",
        "kv_store_full_docs.json",
        "kv_store_text_chunks.json",
        "kv_store_community_reports.json",
        "graph_chunk_entity_relation.graphml",
    )
    for stale in stale_files:
        remove_if_exist(f"{WORKING_DIR}/{stale}")

    rag = GraphRAG(
        working_dir=WORKING_DIR,
        enable_llm_cache=True,
        best_model_func=ollama_model_if_cache,
        cheap_model_func=ollama_model_if_cache,
        embedding_func=ollama_embedding,
    )

    # Insert TXT files, then previously scraped web content.
    _insert_text_files(rag, TXT_DIR, exclude_file=exclude_file)
    _insert_text_files(rag, WEB_CONTENT_DIR, web_content=True)

    logger.info("=== Document Insertion Complete ===")

def query_rag(query="What are the main concepts in Web3 technology?", query_mode="global"):
    """Answer *query* from the knowledge base, offering a web search on failure.

    When GraphRAG returns its canned failure response, the user is asked on
    stdin whether to search online. On "y" the database is backed up, URLs
    are looked up and scraped into the knowledge base, and the query is run
    again.

    Args:
        query: Question to ask.
        query_mode: GraphRAG query mode passed to ``QueryParam``.

    Returns:
        The answer returned by GraphRAG.
    """
    engine = GraphRAG(
        working_dir=WORKING_DIR,
        best_model_func=ollama_model_if_cache,
        cheap_model_func=ollama_model_if_cache,
        embedding_func=ollama_embedding,
    )
    answer = engine.query(query, param=QueryParam(mode=query_mode))

    # Anything other than the canned failure text is a usable answer.
    if answer != PROMPTS["fail_response"]:
        return answer

    logger.info("Current knowledge base might not have sufficient information.")
    print("\nWould you like to search online for more information? (y/n): ")
    wants_search = input().strip().lower() == 'y'
    if not wants_search:
        return answer

    # Snapshot the database before new content is added to it.
    backup_database()

    logger.info("Searching online for relevant URLs...")
    candidate_urls = get_relevant_urls(query)
    if not candidate_urls:
        logger.warning("No relevant URLs found.")
        return answer

    scrape_and_insert_urls(candidate_urls, query)
    # Ask again now that the knowledge base has been extended.
    return query_rag(query, query_mode)

def get_graph_size(rag):
    """Report the node and edge counts of the GraphRAG entity graph.

    Args:
        rag: Object exposing ``chunk_entity_relation_graph._graph``.

    Returns:
        ``(nodes, edges)``; ``(0, 0)`` when the graph cannot be read, in which
        case the error and its traceback are logged.
    """
    try:
        entity_graph = rag.chunk_entity_relation_graph._graph
        return entity_graph.number_of_nodes(), entity_graph.number_of_edges()
    except Exception as err:
        import traceback
        logger.error(f"Error in get_graph_size: {err!s}")
        logger.error(traceback.format_exc())
        return 0, 0

def extract_search_keywords(query):
    """Ask the LLM for the single main keyword or protocol name in *query*.

    Args:
        query: The user's natural-language question.

    Returns:
        The lower-cased keyword with surrounding whitespace and quotes
        removed. Falls back to the original query when the model call fails
        or the reply is empty.
    """
    logger.info("Extracting main keyword from query...")
    
    prompt = f"""Extract the single most important keyword from this query.
    If the query is about a specific protocol or technology, return its name.
    
    Rules:
    - Return only ONE main keyword or protocol name
    - For protocol queries, prioritize the protocol name
    - For technical queries, prioritize the main technology or concept
    
    Example 1:
    Query: "Tell me the details of signature authentication process of the 0x protocol"
    Keyword: 0x protocol
    
    Example 2:
    Query: "What is Namecoin (NMC)?"
    Keyword: Namecoin
    
    Query: "{query}"
    
    Keyword:"""
    
    try:
        # Use Ollama directly
        response = ollama.chat(model=MODEL, messages=[{"role": "user", "content": prompt}])
        raw_keyword = response["message"]["content"]
    except Exception as e:
        logger.error(f"Error extracting keyword: {str(e)}")
        return query  # Fallback to original query

    # Models often wrap the answer in quotes; strip those along with whitespace.
    keyword = raw_keyword.strip().strip("\"'").strip().lower()
    if not keyword:
        # An empty keyword would produce a meaningless web search.
        logger.warning("LLM returned an empty keyword; using the original query instead")
        return query

    logger.info(f"Extracted main keyword: {keyword}")
    return keyword

def get_relevant_urls(query):
    """Search for URLs relevant to *query* and let the user pick which to use.

    The search first uses the LLM-extracted main keyword and falls back to
    the full query when that yields nothing. The user then selects URLs on
    stdin by number (comma-separated) or with ``all``.

    Args:
        query: The user's question.

    Returns:
        The selected URLs in the order chosen, without duplicates. An empty
        list when nothing was found or the selection was invalid.
    """
    scraper = WebScraper()

    # Extract main keyword using LLM
    main_keyword = extract_search_keywords(query)
    logger.info(f"Using main keyword for search: {main_keyword}")

    # First try with just the main keyword
    urls = scraper.search_urls(main_keyword)

    # If no results, try with the original query as fallback
    if not urls:
        logger.info("No results with main keyword, trying original query...")
        urls = scraper.search_urls(query)

    if not urls:
        logger.warning(f"No URLs found for keyword: {main_keyword}")
        return []

    logger.info("Found the following relevant URLs:")
    for i, url in enumerate(urls, 1):
        logger.info(f"{i}. {url}")

    # Ask user which URLs to use
    print("\nEnter the numbers of the URLs you want to use (comma-separated) or 'all':")
    selection = input("Selection: ").strip().lower()

    if selection == 'all':
        return urls

    # Ignore blank entries so a stray comma ("1,2,") is still accepted.
    tokens = [token.strip() for token in selection.split(',') if token.strip()]
    try:
        chosen = [int(token) - 1 for token in tokens]
    except ValueError:
        chosen = []
    if not chosen:
        logger.warning("Invalid selection. No URLs will be used.")
        return []

    # Out-of-range numbers are dropped; dict.fromkeys removes repeats while
    # keeping the user's order, so no URL is scraped twice.
    in_range = dict.fromkeys(i for i in chosen if 0 <= i < len(urls))
    return [urls[i] for i in in_range]

def scrape_and_insert_urls(urls, context_query=None):
    """Scrape content from URLs and insert it into the knowledge base.

    All scraped pages are first saved as text files in WEB_CONTENT_DIR; the
    saved files are then inserted into GraphRAG one by one. Failures for a
    single URL or file are logged and do not stop the remaining work.

    Args:
        urls: URLs to scrape.
        context_query: Accepted for caller compatibility; currently unused.
    """
    import hashlib
    import traceback

    scraper = WebScraper()
    scraped_files = []  # Keep track of successfully scraped content files

    # Step 1: Scrape content and save to files
    logger.info("=== Scraping content and saving to files ===")
    for url in urls:
        logger.info(f"Scraping URL: {url}")
        content = scraper.scrape_url(url)

        if content is None:
            logger.warning(f"Failed to scrape content from {url}")
            continue

        try:
            # Built-in hash() of a str is salted per process, so it would give
            # a different (possibly negative) filename on every run. A content
            # digest keeps one stable file per URL across runs.
            url_digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:16]
            filename = f"web_content_{url_digest}.txt"
            filepath = os.path.join(WEB_CONTENT_DIR, filename)

            # Log content stats before saving
            sentences = content.split('.')
            paragraphs = content.split('\n\n')
            logger.info(f"Content stats - Sentences: {len(sentences)}, Paragraphs: {len(paragraphs)}, Words: {len(content.split())}")

            with open(filepath, "w", encoding="utf-8") as f:
                f.write(content)
            logger.info(f"Saved content to {filepath}")
            scraped_files.append(filepath)

        except Exception as e:
            logger.error(f"Error saving content from {url}: {str(e)}")
            logger.error(traceback.format_exc())

    if not scraped_files:
        logger.warning("No content was successfully scraped and saved")
        return

    # Step 2: Insert the saved files into GraphRAG
    logger.info("=== Inserting content into GraphRAG ===")
    rag = GraphRAG(
        working_dir=WORKING_DIR,
        enable_llm_cache=True,
        best_model_func=ollama_model_if_cache,
        cheap_model_func=ollama_model_if_cache,
        embedding_func=ollama_embedding,
    )

    for filepath in scraped_files:
        try:
            logger.info(f"Processing file: {filepath}")

            with open(filepath, "r", encoding="utf-8") as f:
                content = f.read()

            # Compare graph size around the insert to see whether it added anything.
            nodes_before, edges_before = get_graph_size(rag)

            logger.info("Inserting content into GraphRAG...")
            rag.insert(content)

            nodes_after, edges_after = get_graph_size(rag)

            if nodes_after > nodes_before or edges_after > edges_before:
                logger.info(f"Graph changes - Nodes: +{nodes_after - nodes_before}, Edges: +{edges_after - edges_before}")
            else:
                logger.warning(f"Content from {filepath} did not add new knowledge to the graph")

        except Exception as e:
            logger.error(f"Error inserting content from {filepath}: {str(e)}")
            logger.error(traceback.format_exc())

def backup_database(name=None):
    """Copy the current database files into a folder under ``./backups``.

    Every ``.json`` and ``.graphml`` file in WORKING_DIR is copied. A
    non-empty WEB_CONTENT_DIR is copied as well, into a ``webContent``
    subfolder.

    Args:
        name: Backup folder name; defaults to a ``YYYYMMDD_HHMMSS`` timestamp.

    Returns:
        True when the backup was written, False when any step failed (the
        error is logged).
    """
    import shutil
    from datetime import datetime

    backups_root = "./backups"
    os.makedirs(backups_root, exist_ok=True)

    # A timestamp keeps unnamed backups distinct from each other.
    label = name if name is not None else datetime.now().strftime('%Y%m%d_%H%M%S')
    destination = os.path.join(backups_root, label)

    try:
        os.makedirs(destination, exist_ok=True)

        db_files = [
            entry for entry in os.listdir(WORKING_DIR)
            if entry.endswith(('.json', '.graphml'))
        ]
        for entry in db_files:
            shutil.copy2(os.path.join(WORKING_DIR, entry), os.path.join(destination, entry))

        # Scraped web content is included only when there is some.
        if os.path.exists(WEB_CONTENT_DIR) and os.listdir(WEB_CONTENT_DIR):
            shutil.copytree(
                WEB_CONTENT_DIR,
                os.path.join(destination, "webContent"),
                dirs_exist_ok=True,
            )

        logger.info(f"Database backup created successfully at: {destination}")
        return True
    except Exception as err:
        logger.error(f"Error creating backup: {err!s}")
        return False

def list_backups():
    """Log and return the backups stored under ``./backups``.

    A subfolder counts as a backup when it contains at least one ``.json``
    or ``.graphml`` file.

    Returns:
        A list of ``(name, has_web_content)`` tuples; empty when there are no
        backups.
    """
    backups_root = "./backups"
    if not os.path.exists(backups_root):
        logger.info("No backups found")
        return []

    found = []
    for entry in os.listdir(backups_root):
        candidate = os.path.join(backups_root, entry)
        if not os.path.isdir(candidate):
            continue
        # Skip folders that hold no database files.
        if not any(item.endswith(('.json', '.graphml')) for item in os.listdir(candidate)):
            continue
        includes_web = os.path.exists(os.path.join(candidate, "webContent"))
        found.append((entry, includes_web))

    if not found:
        logger.info("No backups found")
        return found

    logger.info("\nAvailable backups:")
    for entry, includes_web in found:
        web_status = "with web content" if includes_web else "PDF-only"
        logger.info(f"- {entry} ({web_status})")
    return found

def restore_database(name):
    """Replace the current database files with those from a named backup.

    The backup is validated before anything is deleted: it must be a
    directory under ``./backups`` holding at least one ``.json`` or
    ``.graphml`` file. Web content is replaced only when the backup contains
    a ``webContent`` folder.

    Args:
        name: Name of the backup folder under ``./backups``.

    Returns:
        True on success. False when the backup is missing, holds no database
        files, or an error occurred (logged).
    """
    import shutil

    db_suffixes = ('.json', '.graphml')
    backup_path = os.path.join("./backups", name)
    # A plain file with this name is not a usable backup either.
    if not os.path.isdir(backup_path):
        logger.warning(f"Backup '{name}' not found")
        return False

    try:
        # Read the backup first so an unreadable or empty backup cannot wipe
        # the live database.
        backup_files = [f for f in os.listdir(backup_path) if f.endswith(db_suffixes)]
        if not backup_files:
            logger.warning(f"Backup '{name}' contains no database files; nothing restored")
            return False

        # Clear current database files
        for filename in os.listdir(WORKING_DIR):
            if filename.endswith(db_suffixes):
                os.remove(os.path.join(WORKING_DIR, filename))

        # Restore database files
        for filename in backup_files:
            shutil.copy2(
                os.path.join(backup_path, filename),
                os.path.join(WORKING_DIR, filename)
            )

        # Restore web content if it exists in the backup
        web_backup_path = os.path.join(backup_path, "webContent")
        if os.path.exists(web_backup_path):
            # copytree needs a non-existing target, so drop the current folder.
            if os.path.exists(WEB_CONTENT_DIR):
                shutil.rmtree(WEB_CONTENT_DIR)
            shutil.copytree(web_backup_path, WEB_CONTENT_DIR)

        logger.info(f"Database restored successfully from: {name}")
        return True
    except Exception as e:
        logger.error(f"Error restoring backup: {str(e)}")
        return False

def main():
    """Parse the command line and run the first requested action.

    Actions are checked in a fixed priority order (list backups, backup,
    restore, convert, insert, query); only the first one present runs. With
    no action given, the help text is printed.
    """
    cli = argparse.ArgumentParser(description="Web3 GraphRAG Demo")
    cli.add_argument(
        "--convert", action="store_true", help="Convert PDFs to TXT files"
    )
    cli.add_argument(
        "--insert", action="store_true", help="Insert documents into GraphRAG"
    )
    cli.add_argument(
        "--exclude", type=str, help="Filename to exclude during insertion (optional)"
    )
    cli.add_argument("--query", type=str, help="Query to run on the GraphRAG")
    cli.add_argument(
        "--mode",
        choices=["global", "local"],
        default="global",
        help="Query mode: global or local",
    )
    cli.add_argument(
        "--backup",
        type=str,
        nargs='?',
        const='auto',
        help="Create a backup of current database state with optional name",
    )
    cli.add_argument(
        "--list-backups", action="store_true", help="List all available backups"
    )
    cli.add_argument(
        "--restore", type=str, help="Restore database from a specified backup"
    )

    opts = cli.parse_args()

    if opts.list_backups:
        list_backups()
        return
    if opts.backup:
        # 'auto' is the value argparse supplies when --backup has no name.
        backup_database(None if opts.backup == 'auto' else opts.backup)
        return
    if opts.restore:
        restore_database(opts.restore)
        return
    if opts.convert:
        convert_pdfs()
        return
    if opts.insert:
        insert_documents(exclude_file=opts.exclude)
        return
    if opts.query:
        answer = query_rag(opts.query, opts.mode)
        logger.info(f"Query: {opts.query}")
        logger.info(f"Result: {answer}")
        return

    cli.print_help()


if __name__ == "__main__":
    main()