diff --git a/backend/Dockerfile b/backend/Dockerfile
index 4488129d9..b031e4425 100644
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -6,13 +6,15 @@ EXPOSE 8000
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
     libmagic1 \
-    libgl1-mesa-glx \
+    libgl1 \
+    libglx-mesa0 \
     libreoffice \
     cmake \
     poppler-utils \
     tesseract-ocr && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
+
 # Set LD_LIBRARY_PATH
 ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
 # Copy requirements file and install Python dependencies
@@ -20,6 +22,17 @@ COPY requirements.txt constraints.txt /code/
 # --no-cache-dir --upgrade
 RUN pip install --upgrade pip
 RUN pip install -r requirements.txt -c constraints.txt
+
+RUN python -c "from transformers import AutoTokenizer, AutoModel; \
+    name='sentence-transformers/all-MiniLM-L6-v2'; \
+    tok=AutoTokenizer.from_pretrained(name); \
+    mod=AutoModel.from_pretrained(name); \
+    tok.save_pretrained('./local_model'); \
+    mod.save_pretrained('./local_model')"
+
+RUN python -m nltk.downloader -d /usr/local/nltk_data punkt
+RUN python -m nltk.downloader -d /usr/local/nltk_data averaged_perceptron_tagger
+
 # Copy application code
 COPY . /code
 # Set command
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 914bbf19d..ffb6f04a9 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -53,12 +53,12 @@ wrapt==1.17.2
 yarl==1.20.1
 youtube-transcript-api==1.1.0
 zipp==3.23.0
-sentence-transformers==4.1.0
+sentence-transformers==5.0.0
 google-cloud-logging==3.12.1
 pypandoc==1.15
 graphdatascience==1.15.1
 Secweb==1.18.1
-ragas==0.2.15
+ragas==0.3.1
 rouge_score==0.1.2
 langchain-neo4j==0.4.0
 pypandoc-binary==1.15
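
These build layers bake the MiniLM weights (saved to ./local_model) and the punkt and
averaged_perceptron_tagger corpora (under /usr/local/nltk_data) into the image, so
containers no longer need to reach huggingface.co or nltk.org at startup. A minimal
smoke-test sketch, assuming the container's working directory is the /code directory
that local_model was saved into; the script name and docker invocation are hypothetical:

    # smoke_test.py, e.g.: docker run --rm <image> python smoke_test.py
    import nltk
    from langchain_huggingface import HuggingFaceEmbeddings

    nltk.data.path.append("/usr/local/nltk_data")
    nltk.data.find("tokenizers/punkt")                    # raises LookupError if absent
    nltk.data.find("taggers/averaged_perceptron_tagger")  # raises LookupError if absent

    emb = HuggingFaceEmbeddings(model_name="./local_model")
    assert len(emb.embed_query("hello")) == 384           # all-MiniLM-L6-v2 dimension
    print("Baked-in model and NLTK data are usable.")
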
diff --git a/backend/src/QA_integration.py b/backend/src/QA_integration.py
index 1d3afb8e8..1a9e24eb3 100644
--- a/backend/src/QA_integration.py
+++ b/backend/src/QA_integration.py
@@ -38,7 +38,6 @@ load_dotenv()
 
 EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL')
-EMBEDDING_FUNCTION , _ = load_embedding_model(EMBEDDING_MODEL)
 
 class SessionChatHistory:
     history_dict = {}
@@ -304,6 +303,7 @@ def create_document_retriever_chain(llm, retriever):
         output_parser = StrOutputParser()
 
         splitter = TokenTextSplitter(chunk_size=CHAT_DOC_SPLIT_SIZE, chunk_overlap=0)
+        EMBEDDING_FUNCTION , _ = load_embedding_model(EMBEDDING_MODEL)
         embeddings_filter = EmbeddingsFilter(
             embeddings=EMBEDDING_FUNCTION,
             similarity_threshold=CHAT_EMBEDDING_FILTER_SCORE_THRESHOLD
@@ -344,7 +344,7 @@ def initialize_neo4j_vector(graph, chat_mode_settings):
 
         if not retrieval_query or not index_name:
             raise ValueError("Required settings 'retrieval_query' or 'index_name' are missing.")
-
+        EMBEDDING_FUNCTION , _ = load_embedding_model(EMBEDDING_MODEL)
         if keyword_index:
             neo_db = Neo4jVector.from_existing_graph(
                 embedding=EMBEDDING_FUNCTION,
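
With the module-level call removed, importing QA_integration no longer loads an
embedding model; each call site now invokes load_embedding_model on demand. The
HuggingFace default stays cheap because common_fn.py (further down) caches it as a
singleton; for the remote providers, a memoized wrapper would avoid re-constructing a
client on every call. A sketch of that option, not part of this patch:

    from functools import lru_cache

    @lru_cache(maxsize=1)
    def get_embedding_function():
        # Defers load_embedding_model to first use, then serves the cached result.
        return load_embedding_model(EMBEDDING_MODEL)
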
diff --git a/backend/src/document_sources/gcs_bucket.py b/backend/src/document_sources/gcs_bucket.py
index d47d000b2..0aefa11e7 100644
--- a/backend/src/document_sources/gcs_bucket.py
+++ b/backend/src/document_sources/gcs_bucket.py
@@ -46,46 +46,58 @@ def gcs_loader_func(file_path):
   return loader
 
 def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token=None):
-  nltk.download('punkt')
-  nltk.download('averaged_perceptron_tagger')
-  if gcs_bucket_folder is not None and gcs_bucket_folder.strip()!="":
-    if gcs_bucket_folder.endswith('/'):
-      blob_name = gcs_bucket_folder+gcs_blob_filename
+
+  nltk.data.path.append("/usr/local/nltk_data")
+  nltk.data.path.append(os.path.expanduser("~/.nltk_data"))
+  try:
+    nltk.data.find("tokenizers/punkt")
+  except LookupError:
+    for resource in ["punkt", "averaged_perceptron_tagger"]:
+      try:
+        nltk.data.find(f"tokenizers/{resource}" if resource == "punkt" else f"taggers/{resource}")
+      except LookupError:
+        logging.info(f"Downloading NLTK resource: {resource}")
+        nltk.download(resource, download_dir=os.path.expanduser("~/.nltk_data"))
+
+  logging.info("NLTK resources downloaded successfully.")
+  if gcs_bucket_folder is not None and gcs_bucket_folder.strip()!="":
+    if gcs_bucket_folder.endswith('/'):
+      blob_name = gcs_bucket_folder+gcs_blob_filename
+    else:
+      blob_name = gcs_bucket_folder+'/'+gcs_blob_filename
   else:
-      blob_name = gcs_bucket_folder+'/'+gcs_blob_filename
-  else:
-    blob_name = gcs_blob_filename
-
-  logging.info(f"GCS project_id : {gcs_project_id}")
-
-  if access_token is None:
-    storage_client = storage.Client(project=gcs_project_id)
-    bucket = storage_client.bucket(gcs_bucket_name)
-    blob = bucket.blob(blob_name)
+    blob_name = gcs_blob_filename
 
-    if blob.exists():
-      loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=gcs_loader_func)
-      pages = loader.load()
-    else :
-      raise LLMGraphBuilderException('File does not exist, Please re-upload the file and try again.')
-  else:
-    creds= Credentials(access_token)
-    storage_client = storage.Client(project=gcs_project_id, credentials=creds)
+  logging.info(f"GCS project_id : {gcs_project_id}")
 
-    bucket = storage_client.bucket(gcs_bucket_name)
-    blob = bucket.blob(blob_name)
-    if blob.exists():
-      content = blob.download_as_bytes()
-      pdf_file = io.BytesIO(content)
-      pdf_reader = PdfReader(pdf_file)
-      # Extract text from all pages
-      text = ""
-      for page in pdf_reader.pages:
-        text += page.extract_text()
-      pages = [Document(page_content = text)]
+  if access_token is None:
+    storage_client = storage.Client(project=gcs_project_id)
+    bucket = storage_client.bucket(gcs_bucket_name)
+    blob = bucket.blob(blob_name)
+
+    if blob.exists():
+      loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=gcs_loader_func)
+      pages = loader.load()
+    else :
+      raise LLMGraphBuilderException('File does not exist, Please re-upload the file and try again.')
   else:
-      raise LLMGraphBuilderException(f'File Not Found in GCS bucket - {gcs_bucket_name}')
-  return gcs_blob_filename, pages
+    creds= Credentials(access_token)
+    storage_client = storage.Client(project=gcs_project_id, credentials=creds)
+
+    bucket = storage_client.bucket(gcs_bucket_name)
+    blob = bucket.blob(blob_name)
+    if blob.exists():
+      content = blob.download_as_bytes()
+      pdf_file = io.BytesIO(content)
+      pdf_reader = PdfReader(pdf_file)
+      # Extract text from all pages
+      text = ""
+      for page in pdf_reader.pages:
+        text += page.extract_text()
+      pages = [Document(page_content = text)]
+    else:
+      raise LLMGraphBuilderException(f'File Not Found in GCS bucket - {gcs_bucket_name}')
+  return gcs_blob_filename, pages
 
 def upload_file_to_gcs(file_chunk, chunk_number, original_file_name, bucket_name, folder_name_sha1_hashed):
   try:
diff --git a/backend/src/make_relationships.py b/backend/src/make_relationships.py
index 97aa7e33e..bfb945617 100644
--- a/backend/src/make_relationships.py
+++ b/backend/src/make_relationships.py
@@ -12,7 +12,6 @@ logging.basicConfig(format='%(asctime)s - %(message)s',level='INFO')
 
 EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL')
-EMBEDDING_FUNCTION , EMBEDDING_DIMENSION = load_embedding_model(EMBEDDING_MODEL)
 
 def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_documents_chunk_chunk_Id : list):
     batch_data = []
@@ -41,7 +40,7 @@ def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_documents_chunk_chunk_Id : list):
 
 def create_chunk_embeddings(graph, chunkId_chunkDoc_list, file_name):
     isEmbedding = os.getenv('IS_EMBEDDING')
-    embeddings, dimension = EMBEDDING_FUNCTION , EMBEDDING_DIMENSION
+    embeddings, dimension = load_embedding_model(EMBEDDING_MODEL)
     logging.info(f'embedding model:{embeddings} and dimesion:{dimension}')
     data_for_query = []
     logging.info(f"update embedding and vector index for chunks")
@@ -161,6 +160,7 @@ def create_chunk_vector_index(graph):
     vector_index_query = "SHOW INDEXES YIELD name, type, labelsOrTypes, properties WHERE name = 'vector' AND type = 'VECTOR' AND 'Chunk' IN labelsOrTypes AND 'embedding' IN properties RETURN name"
     vector_index = execute_graph_query(graph,vector_index_query)
     if not vector_index:
+        EMBEDDING_FUNCTION , EMBEDDING_DIMENSION = load_embedding_model(EMBEDDING_MODEL)
         vector_store = Neo4jVector(embedding=EMBEDDING_FUNCTION,
                                    graph=graph,
                                    node_label="Chunk",
diff --git a/backend/src/ragas_eval.py b/backend/src/ragas_eval.py
index 251ab71c0..6092ae819 100644
--- a/backend/src/ragas_eval.py
+++ b/backend/src/ragas_eval.py
@@ -13,7 +13,13 @@ from ragas.embeddings import LangchainEmbeddingsWrapper
 import nltk
 
-nltk.download('punkt')
+nltk.data.path.append("/usr/local/nltk_data")
+nltk.data.path.append(os.path.expanduser("~/.nltk_data"))
+try:
+    nltk.data.find("tokenizers/punkt")
+except LookupError:
+    nltk.download("punkt", download_dir=os.path.expanduser("~/.nltk_data"))
+
 load_dotenv()
 
 EMBEDDING_MODEL = os.getenv("RAGAS_EMBEDDING_MODEL")
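
gcs_bucket.py above and ragas_eval.py here now share the same find-before-download
pattern. One subtlety in the gcs_bucket.py version: the per-resource loop lives inside
the except branch of the initial punkt lookup, so averaged_perceptron_tagger is only
checked when punkt is missing. The Dockerfile pre-seeds both resources, which makes
this harmless in practice, but a shared helper that checks every resource
unconditionally would remove both the asymmetry and the duplication. A sketch, with
the function name and placement being hypothetical:

    import logging
    import os

    import nltk

    def ensure_nltk_resources(download_dir: str = os.path.expanduser("~/.nltk_data")) -> None:
        """Check each resource independently and download only what is missing."""
        nltk.data.path.append("/usr/local/nltk_data")  # location baked into the image
        nltk.data.path.append(download_dir)            # writable runtime fallback
        for lookup, resource in [
            ("tokenizers/punkt", "punkt"),
            ("taggers/averaged_perceptron_tagger", "averaged_perceptron_tagger"),
        ]:
            try:
                nltk.data.find(lookup)
            except LookupError:
                logging.info(f"Downloading NLTK resource: {resource}")
                nltk.download(resource, download_dir=download_dir)
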
+    """
+    Lazy, thread-safe singleton. Callers do not need to worry about
+    import-time initialization or download races.
+    """
+    global _embedding_instance
+    if _embedding_instance is not None:
+        return _embedding_instance
+    with _lock:
+        if _embedding_instance is not None:
+            return _embedding_instance
+        # Ensure model is present before instantiating
+        ensure_sentence_transformer_model_downloaded()
+        _embedding_instance = HuggingFaceEmbeddings(model_name=MODEL_PATH)
+        print("Embedding model initialized.")
+        return _embedding_instance
+
 def check_url_source(source_type, yt_url:str=None, wiki_query:str=None):
     language=''
     try:
@@ -85,9 +122,8 @@ def load_embedding_model(embedding_model_name: str):
         dimension = 1536
         logging.info(f"Embedding: Using bedrock titan Embeddings , Dimension:{dimension}")
     else:
-        embeddings = HuggingFaceEmbeddings(
-            model_name="all-MiniLM-L6-v2"#, cache_folder="/embedding_model"
-        )
+        # embeddings = HuggingFaceEmbeddings(model_name="./local_model")
+        embeddings = get_local_sentence_transformer_embedding()
         dimension = 384
         logging.info(f"Embedding: Using Langchain HuggingFaceEmbeddings , Dimension:{dimension}")
     return embeddings, dimension
diff --git a/docker-compose.yml b/docker-compose.yml
index 4b166f490..bdc5069c4 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -7,13 +7,9 @@ services:
       dockerfile: Dockerfile
     volumes:
       - ./backend:/code
+    env_file:
+      - ./backend/.env
     environment:
-      - NEO4J_URI=${NEO4J_URI-neo4j://database:7687}
-      - NEO4J_PASSWORD=${NEO4J_PASSWORD-password}
-      - NEO4J_USERNAME=${NEO4J_USERNAME-neo4j}
-      - OPENAI_API_KEY=${OPENAI_API_KEY-}
-      - DIFFBOT_API_KEY=${DIFFBOT_API_KEY-}
-      - EMBEDDING_MODEL=${EMBEDDING_MODEL-all-MiniLM-L6-v2}
       - LANGCHAIN_ENDPOINT=${LANGCHAIN_ENDPOINT-}
       - LANGCHAIN_TRACING_V2=${LANGCHAIN_TRACING_V2-}
      - LANGCHAIN_PROJECT=${LANGCHAIN_PROJECT-}
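
The backend service now takes its connection and API settings from backend/.env via
env_file instead of inline environment entries; the remaining LANGCHAIN_* values are
still interpolated from the host shell. A hypothetical backend/.env reproducing the
defaults the old environment block supplied:

    NEO4J_URI=neo4j://database:7687
    NEO4J_USERNAME=neo4j
    NEO4J_PASSWORD=password
    OPENAI_API_KEY=
    DIFFBOT_API_KEY=
    EMBEDDING_MODEL=all-MiniLM-L6-v2

Since the backend already calls load_dotenv(), the same file can also be picked up
when running the backend outside Docker from the backend directory.
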
diff --git a/frontend/src/components/Layout/PageLayout.tsx b/frontend/src/components/Layout/PageLayout.tsx
index a7dfa1c0a..d48328106 100644
--- a/frontend/src/components/Layout/PageLayout.tsx
+++ b/frontend/src/components/Layout/PageLayout.tsx
@@ -24,8 +24,6 @@ import { SKIP_AUTH } from '../../utils/Constants';
 import { useNavigate } from 'react-router';
 import { deduplicateByFullPattern, deduplicateNodeByValue } from '../../utils/Utils';
 import DataImporterSchemaDialog from '../Popups/GraphEnhancementDialog/EnitityExtraction/DataImporter';
-
-
 const GCSModal = lazy(() => import('../DataSources/GCS/GCSModal'));
 const S3Modal = lazy(() => import('../DataSources/AWS/S3Modal'));
 const GenericModal = lazy(() => import('../WebSources/GenericSourceModal'));
diff --git a/frontend/src/components/Popups/GraphEnhancementDialog/EnitityExtraction/GraphPattern.tsx b/frontend/src/components/Popups/GraphEnhancementDialog/EnitityExtraction/GraphPattern.tsx
index b6d9bd982..0f9a94cfb 100644
--- a/frontend/src/components/Popups/GraphEnhancementDialog/EnitityExtraction/GraphPattern.tsx
+++ b/frontend/src/components/Popups/GraphEnhancementDialog/EnitityExtraction/GraphPattern.tsx
@@ -38,6 +38,16 @@ const GraphPattern: React.FC = ({
   });
   const sourceRef = useRef(null);
   const { userCredentials } = useCredentials();
+  const deduplicateOptions = (options: OptionType[]): OptionType[] => {
+    const seen = new Set();
+    return options.filter((option) => {
+      if (seen.has(option.value)) {
+        return false;
+      }
+      seen.add(option.value);
+      return true;
+    });
+  };
 
   useEffect(() => {
     const isGlobalStateSet =
@@ -64,17 +74,53 @@ const GraphPattern: React.FC = ({
             target: { value: targetVal, label: targetVal },
           };
         });
-        const savedSources: OptionType[] = Array.from(sourceSet).map((val) => ({ value: val, label: val }));
         const savedTypes: OptionType[] = Array.from(typeSet).map((val) => ({ value: val, label: val }));
-        const savedTargets: OptionType[] = Array.from(targetSet).map((val) => ({ value: val, label: val }));
+        const combinedSourceTarget = new Set([...sourceSet, ...targetSet]);
+        const combinedSourceTargetOptions: OptionType[] = Array.from(combinedSourceTarget).map((val) => ({
+          value: val,
+          label: val,
+        }));
+
         setSelectedRels(mappedRels);
-        setSourceOptions(savedSources);
+        setSourceOptions(combinedSourceTargetOptions);
         setTypeOptions(savedTypes);
-        setTargetOptions(savedTargets);
+        setTargetOptions(combinedSourceTargetOptions);
       }
     }
   }, []);
 
+  useEffect(() => {
+    let timeoutId: NodeJS.Timeout;
+    timeoutId = setTimeout(() => {
+      if (sourceOptions.length > 0) {
+        const deduped = deduplicateOptions(sourceOptions);
+        if (deduped.length !== sourceOptions.length) {
+          setSourceOptions(deduped);
+        }
+      }
+
+      if (targetOptions.length > 0) {
+        const deduped = deduplicateOptions(targetOptions);
+        if (deduped.length !== targetOptions.length) {
+          setTargetOptions(deduped);
+        }
+      }
+
+      if (typeOptions.length > 0) {
+        const deduped = deduplicateOptions(typeOptions);
+        if (deduped.length !== typeOptions.length) {
+          setTypeOptions(deduped);
+        }
+      }
+    }, 1000);
+
+    return () => {
+      if (timeoutId) {
+        clearTimeout(timeoutId);
+      }
+    };
+  }, []);
+
   const handleNewValue = (newValue: string, type: 'source' | 'type' | 'target') => {
     const regex = /^[^,]*$/;
     if (!newValue.trim()) {
@@ -92,8 +138,12 @@ const GraphPattern: React.FC = ({
     } else {
       setShowWarning((old) => ({ ...old, [type]: { showError: false, errorMessage: '' } }));
       const newOption: OptionType = { value: newValue.trim(), label: newValue.trim() };
-      const checkUniqueValue = (list: OptionType[], value: OptionType) =>
-        (list.some((opt) => opt.value === value.value) ? list : [...list, value]);
+      const checkUniqueValue = (list: OptionType[], value: OptionType) => {
+        const exists = list.some((opt) => opt.value === value.value);
+        const updatedList = exists ? list : [...list, value];
+        return deduplicateOptions(updatedList);
+      };
+
       switch (type) {
         case 'source':
           setSourceOptions((prev) => checkUniqueValue(prev, newOption));
@@ -110,7 +160,7 @@ const GraphPattern: React.FC = ({
           onPatternChange(selectedSource as OptionType, selectedType as OptionType, newOption);
           break;
         default:
-          console.log('wrong type added');
+          // Invalid type provided
           break;
       }
       setInputValues((prev) => ({ ...prev, [type]: '' }));
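
deduplicateOptions keeps the first occurrence of each value and preserves order, so
reapplying it after every merge is safe and idempotent. Note that the new debounced
effect runs with an empty dependency array: its one-second timer fires once after
mount against the options captured at that moment, while later additions are
deduplicated through checkUniqueValue. A standalone sketch of the helper's behavior,
with OptionType reduced to the fields used here:

    type OptionType = { value: string; label: string };

    const deduplicateOptions = (options: OptionType[]): OptionType[] => {
      const seen = new Set<string>();
      return options.filter((option) => {
        if (seen.has(option.value)) {
          return false; // repeat: keep only the first occurrence
        }
        seen.add(option.value);
        return true;
      });
    };

    // Order preserved, first occurrence wins:
    const merged = [
      { value: 'Person', label: 'Person' },
      { value: 'Movie', label: 'Movie' },
      { value: 'Person', label: 'Person' },
    ];
    console.log(deduplicateOptions(merged).map((o) => o.value)); // ["Person", "Movie"]
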