Skip to content

Commit

Permalink
feat: Add ability to track changes in files and embed accordingly
Browse files Browse the repository at this point in the history
This change checks the directory for the config folder and creates it if it is not present. It then checks whether the files have already been embedded. If they have, it checks for changes and re-embeds if necessary; otherwise, it embeds the files.
  • Loading branch information
shxntanu committed Dec 22, 2024
1 parent 2c748ea commit a966bb8
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 16 deletions.
46 changes: 35 additions & 11 deletions lesa/cli/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import typer
import time
import shutil
from datetime import datetime
from rich.text import Text
from rich.panel import Panel
from rich.table import Table
Expand Down Expand Up @@ -96,18 +97,41 @@ def embed():
console.print(f"[red]Error: Ollama server is not running![/red]")
console.print(f"Start the ollama server by running [cyan]lesa start[/cyan] command.")
raise typer.Exit(1)

with console.status("Creating configuration files...", spinner="earth") as status:

if not dm.check_configuration_folder():
console.print("[yellow]Configuration folder not found. Initializing...[/yellow]")
dm.init()
status.update("Extracting text from files...")
for file in dm.files:
console.log(f"Extracting text from {file}...")
docs = dm.extract_file_text(filepath=file)
console.log(f"Embedding documents from {file}...")
dm.embed_documents(docs)
console.log("Saving vector embeddings...")
dm.save_vector_store()
console.print("[green]Initialized configuration for embedding files[/green]")

if not dm.retrieve_config_key("embeddings_initialized"):
with console.status("Extracting text from files...", spinner="earth") as status:
for file in dm.files:
console.log(f"Extracting text from {file}...")
docs = dm.extract_file_text(filepath=file)
console.log(f"Embedding documents from {file}...")
dm.embed_documents(docs)
console.log("Saving vector embeddings...")
dm.save_vector_store()
dm.update_config_key("embeddings_initialized", True)
dm.update_config_key('embeddings_init_time', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
console.print("[green]Initialized configuration for embedding files[/green]")

else:
changes = dm.check_for_changes()
if len(changes.get('new_files')) == 0 and len(changes.get('modified_files')) == 0 and len(changes.get('deleted_files')) == 0:
console.print("[yellow]No changes detected in the directory[/yellow]")
else:
dm.scan_files()
console.print("[green]Files in the directory have been changed since last embedding. Embedding again...[/green]")
with console.status("Extracting text from files...", spinner="earth") as status:
for file in dm.files:
console.log(f"Extracting text from {file}...")
docs = dm.extract_file_text(filepath=file)
console.log(f"Embedding documents from {file}...")
dm.embed_documents(docs)
console.log("Saving vector embeddings...")
dm.save_vector_store()
dm.update_config_key('embeddings_init_time', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
console.print("[green]Embeddings updated successfully![/green]")

@app.command()
def read(file_path: str = typer.Argument(..., help="Path of the file to read")):
Expand Down
4 changes: 1 addition & 3 deletions lesa/core/conversations.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

from lesa.core.documents import DocumentManager
from lesa.core.ollama import OllamaManager
from lesa.core.directory_manager import DirectoryManager

Expand All @@ -36,7 +35,6 @@ def __init__(self,

self.base_path = os.path.abspath(base_path)
self.embedding_path = os.path.join(self.base_path, self.CONFIG_DIR, 'embeddings')
self.document_manager = DocumentManager()
self.ollama_manager = OllamaManager()
self.embeddings_manager = DirectoryManager(document_model=document_model)

Expand Down Expand Up @@ -134,7 +132,7 @@ def embed_single_document_and_chat(self,
:return: List of embedded documents
"""

documents = self.document_manager.extract_document_text(file_path)
documents = self.embeddings_manager.extract_file_text(file_path)
docs = self.embeddings_manager.text_splitter.split_documents(documents=documents)

# Embed documents
Expand Down
41 changes: 39 additions & 2 deletions lesa/core/directory_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,9 @@ def init(self) -> None:
file_metadata = self._scan_directory()

config_data = {
'initialized_at': datetime.now().isoformat(),
'initialized_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
'embeddings_initialized': False,
'embeddings_init_time': None,
'base_path': self.base_path,
'files': file_metadata
}
Expand All @@ -169,7 +171,6 @@ def init(self) -> None:
json.dump(config_data, f, indent=4)

self.scan_files()
# print(f"Lesa embeddings initialized in {self.base_path}")

def _scan_directory(self, ignore_patterns: Optional[List[str]] = None) -> Dict[str, Dict]:
"""
Expand Down Expand Up @@ -233,6 +234,42 @@ def check_for_changes(self) -> Dict:

return changes

def check_configuration_folder(self) -> bool:
    """
    Check if the configuration folder and its configuration file exist.

    :return: True if both ``self.config_path`` and ``self.config_file_path``
        exist on disk, False otherwise
    """
    # Return the boolean expression directly: falling off the end of the
    # function would yield None and contradict the declared ``bool`` return.
    return os.path.exists(self.config_path) and os.path.exists(self.config_file_path)

def update_config_key(self, key: str, value: object) -> None:
    """
    Update a specific key in the configuration file.

    The file at ``self.config_file_path`` is read, the key is set (added if
    it is not present yet), and the whole document is written back with
    4-space indentation.

    :param key: Key to update
    :param value: New value for the key; must be JSON-serializable
    :raises TypeError: If ``value`` cannot be serialized to JSON; the
        configuration file is left unmodified in that case
    """
    with open(self.config_file_path, 'r', encoding='utf-8') as f:
        config_data = json.load(f)

    config_data[key] = value

    # Serialize BEFORE opening the file for writing. Opening with 'w'
    # truncates the file immediately, so a serialization error raised
    # mid-dump would otherwise leave a truncated, unparsable config behind.
    serialized = json.dumps(config_data, indent=4)

    with open(self.config_file_path, 'w', encoding='utf-8') as f:
        f.write(serialized)

def retrieve_config_key(self, key: str) -> object:
    """
    Retrieve a specific key from the configuration file.

    The file at ``self.config_file_path`` is re-read on every call.

    :param key: Key to retrieve
    :return: Value stored under the key, or None if the key is absent
    """
    # JSON documents are UTF-8; decode explicitly instead of relying on the
    # platform's locale encoding.
    with open(self.config_file_path, 'r', encoding='utf-8') as f:
        config_data = json.load(f)

    return config_data.get(key)

def update_configuration(self) -> None:
"""
Update the configuration after detecting changes.
Expand Down

0 comments on commit a966bb8

Please sign in to comment.